EventRegistry
diff --git a/‎CHANGELOG.md
Lines changed: 16 additions & 0 deletions b/‎CHANGELOG.md
Lines changed: 16 additions & 0 deletions
diff --git a/‎eventregistry/Analytics.py
Lines changed: 46 additions & 20 deletions b/‎eventregistry/Analytics.py
Lines changed: 46 additions & 20 deletions
diff --git a/‎eventregistry/EventRegistry.py
Lines changed: 12 additions & 4 deletions b/‎eventregistry/EventRegistry.py
Lines changed: 12 additions & 4 deletions
diff --git a/‎eventregistry/QueryArticles.py
Lines changed: 20 additions & 10 deletions b/‎eventregistry/QueryArticles.py
Lines changed: 20 additions & 10 deletions
@@ -1,5 +1,21 @@
 # Change Log
 
+## [v8.6]() (2019-02-22)
+
+**Added**
+- We added sentiment, which can now be used when querying articles and events. The `QueryArticles`, `QueryArticlesIter`, `QueryEvents` and `QueryEventsIter` constructors now all have additional parameters `minSentiment` and `maxSentiment` that can be used to filter the articles and events. The valid values are between -1 (very negative sentiment) and 1 (very positive sentiment). A value of 0 represents neutral sentiment.
+- Sentiment was also added as a property in the returned articles and events.
+
+**Updated**
+
+- Analytics: We updated `trainTopicOnTweets()`, `trainTopicClearTopic()` and `trainTopicGetTrainedTopic()` methods in the `Analytics` class.
+- `QueryArticles.initWithComplexQuery()` was updated - the parameter `dataType` was removed (the `dataType` value should now be provided in the `$filter` section of the query).
+- `TopicPage` now also supports setting the source rank percentile.
+- `Analytics.annotate()` method now supports passing custom parameters that should be used when annotating the text.
+- `Analytics.extractArticleInfo()` now also supports passing headers and cookies to be used when extracting article info from a URL.
+- Changed some defaults in the returned data. When searching articles, we now by default return article image and sentiment.
+
+
 ## [v8.5]() (2018-08-29)
 
 **Added**
 
@@ -10,6 +10,7 @@
 NOTE: the functionality is currently in BETA. The API calls or the provided outputs may change in the future.
 """
 
+import json
 from eventregistry.Base import *
 from eventregistry.ReturnInfo import *
 
@@ -21,14 +22,18 @@ def __init__(self, eventRegistry):
         self._er = eventRegistry
 
 
-    def annotate(self, text, lang = None):
+    def annotate(self, text, lang = None, customParams = None):
         """
         identify the list of entities and nonentities mentioned in the text
         @param text: input text to annotate
         @param lang: language of the provided document (can be an ISO2 or ISO3 code). If None is provided, the language will be automatically detected
+        @param customParams: None or a dict with custom parameters to send to the annotation service
         @returns: dict
         """
-        return self._er.jsonRequestAnalytics("/api/v1/annotate", { "lang": lang, "text": text })
+        params = {"lang": lang, "text": text}
+        if customParams:
+            params.update(customParams)
+        return self._er.jsonRequestAnalytics("/api/v1/annotate", params)
 
 
     def categorize(self, text, taxonomy = "dmoz"):
@@ -75,17 +80,27 @@ def detectLanguage(self, text):
         return self._er.jsonRequestAnalytics("/api/v1/detectLanguage", { "text": text })
 
 
-    def extractArticleInfo(self, url, proxyUrl = None):
+    def extractArticleInfo(self, url, proxyUrl = None, headers = None, cookies = None):
         """
         extract all available information about an article available at url `url`. Returned information will include
         article title, body, authors, links in the articles, ...
         @param url: article url to extract article information from
         @param proxyUrl: proxy that should be used for downloading article information. format: {schema}://{username}:{pass}@{proxy url/ip}
+        @param headers: dict with headers to set in the request (optional)
+        @param cookies: dict with cookies to set in the request (optional)
         @returns: dict
         """
         params = { "url": url }
         if proxyUrl:
             params["proxyUrl"] = proxyUrl
+        if headers:
+            if isinstance(headers, dict):
+                headers = json.dumps(headers)
+            params["headers"] = headers
+        if cookies:
+            if isinstance(cookies, dict):
+                cookies = json.dumps(cookies)
+            params["cookies"] = cookies
         return self._er.jsonRequestAnalytics("/api/v1/extractArticleInfo", params)
 
 
@@ -98,24 +113,34 @@ def ner(self, text):
         return self._er.jsonRequestAnalytics("/api/v1/ner", {"text": text})
 
 
-    def trainTopicOnTweets(self, twitterQuery, useTweetText = True, maxConcepts = 20, maxCategories = 10, maxTweets = 2000, notifyEmailAddress = None):
+    def trainTopicOnTweets(self, twitterQuery, useTweetText=True, useIdfNormalization=True,
+            normalization="linear", maxTweets=2000, maxUsedLinks=500, ignoreConceptTypes=None,
+            maxConcepts = 20, maxCategories = 10, notifyEmailAddress = None):
         """
         create a new topic and train it using the tweets that match the twitterQuery
         @param twitterQuery: string containing the content to search for. It can be a Twitter user account (using "@" prefix or user's Twitter url),
                 a hash tag (using "#" prefix) or a regular keyword.
         @param useTweetText: do you want to analyze the content of the tweets and extract the concepts mentioned in them? If False, only content shared
             in the articles in the user's tweets will be analyzed
+        @param useIdfNormalization: normalize identified concepts by their IDF in the news (punish very common concepts)
+        @param normalization: way to normalize the concept weights ("none", "linear")
+        @param maxTweets: maximum number of tweets to collect (default 2000, max 5000)
+        @param maxUsedLinks: maximum number of article links in the tweets to analyze (default 500, max 2000)
+        @param ignoreConceptTypes: what types of concepts you would like to ignore in the profile. options: person, org, loc, wiki or an array with those. When None or empty, the parameter is not sent
         @param maxConcepts: the number of concepts to save in the final topic
         @param maxCategories: the number of categories to save in the final topic
         @param maxTweets: the maximum number of tweets to collect for the user to analyze
         @param notifyEmailAddress: when finished, should we send a notification email to this address?
         """
-        assert maxTweets < 5000, "we can analyze at most 5000 tweets"
-        params = {"twitterQuery": twitterQuery,
-            "useTweetText": useTweetText, "maxConcepts": maxConcepts, "maxCategories": maxCategories,
-            "maxTweets": maxTweets}
+        assert maxTweets <= 5000, "we can analyze at most 5000 tweets"
+        params = {"twitterQuery": twitterQuery, "useTweetText": useTweetText,
+            "useIdfNormalization": useIdfNormalization, "normalization": normalization,
+            "maxTweets": maxTweets, "maxUsedLinks": maxUsedLinks,
+            "maxConcepts": maxConcepts, "maxCategories": maxCategories }
         if notifyEmailAddress:
             params["notifyEmailAddress"] = notifyEmailAddress
+        if ignoreConceptTypes:
+            params["ignoreConceptTypes"] = ignoreConceptTypes
         return self._er.jsonRequestAnalytics("/api/v1/trainTopicOnTwitter", params)
 
 
@@ -127,31 +152,36 @@ def trainTopicCreateTopic(self, name):
         return self._er.jsonRequestAnalytics("/api/v1/trainTopic", { "action": "createTopic", "name": name})
 
 
-    def trainTopicAddDocument(self, uri, text):
+    def trainTopicClearTopic(self, uri):
         """
-        add the information extracted from the provided "text" to the topic with uri "uri"
-        @param uri: uri of the topic (obtained by calling trainTopicCreateTopic method)
-        @param text: text to analyze and extract information from
+        if the topic is already existing, clear the definition of the topic. Use this if you want to retrain an existing topic
+        @param uri: uri of the topic (obtained by calling trainTopicCreateTopic method) to clear
         """
-        return self._er.jsonRequestAnalytics("/api/v1/trainTopic", { "action": "addDocument", "uri": uri, "text": text})
+        return self._er.jsonRequestAnalytics("/api/v1/trainTopic", { "action": "clearTopic", "uri": uri })
 
 
-    def trainTopicFinishTraining(self, uri, maxConcepts = 20, maxCategories = 10, idfNormalization = True):
+    def trainTopicAddDocument(self, uri, text):
         """
         add the information extracted from the provided "text" to the topic with uri "uri"
         @param uri: uri of the topic (obtained by calling trainTopicCreateTopic method)
-        @param maxConcepts: number of top concepts to save in the topic
-        @param maxCategories: number of top categories to save in the topic
-        @param idfNormalization: should the concepts be normalized by punishing the commonly mentioned concepts
-        @param returns: returns the trained topic: { concepts: [], categories: [] }
+        @param text: text to analyze and extract information from
         """
-        return self._er.jsonRequestAnalytics("/api/v1/trainTopic", {"action": "finishTraining", "uri": uri, "maxConcepts": maxConcepts, "maxCategories": maxCategories, "idfNormalization": idfNormalization})
+        return self._er.jsonRequestAnalytics("/api/v1/trainTopic", { "action": "addDocument", "uri": uri, "text": text})
 
 
-    def trainTopicGetTrainedTopic(self, uri):
+    def trainTopicGetTrainedTopic(self, uri, maxConcepts = 20, maxCategories = 10,
+            ignoreConceptTypes=None, idfNormalization = True):
         """
         retrieve topic for the topic for which you have already finished training
         @param uri: uri of the topic (obtained by calling trainTopicCreateTopic method)
+        @param maxConcepts: number of top concepts to retrieve in the topic
+        @param maxCategories: number of top categories to retrieve in the topic
+        @param ignoreConceptTypes: what types of concepts you would like to ignore in the profile. options: person, org, loc, wiki or an array with those
+        @param idfNormalization: should the concepts be normalized by punishing the commonly mentioned concepts
-        @param returns: returns the trained topic: { concepts: [], categories: [] }
+        @returns: the trained topic: { concepts: [], categories: [] }
         """
-        return self._er.jsonRequestAnalytics("/api/v1/trainTopic", { "action": "getTrainedTopic", "uri": uri })
+        params = { "action": "getTrainedTopic", "uri": uri, "maxConcepts": maxConcepts, "maxCategories": maxCategories, "idfNormalization": idfNormalization }
+        # send the filter only when one was given (as trainTopicOnTweets does). NOTE(review): request key assumed to match the parameter name - confirm against the API
+        if ignoreConceptTypes:
+            params["ignoreConceptTypes"] = ignoreConceptTypes
+        return self._er.jsonRequestAnalytics("/api/v1/trainTopic", params)
@@ -165,7 +165,10 @@ def getUrl(self, query):
         # don't modify original query params
         allParams = query._getQueryParams()
         # make the url
-        url = self._host + query._getPath() + "?" + urllib.urlencode(allParams, doseq=True)
+        try:
+            url = self._host + query._getPath() + "?" + urllib.urlencode(allParams, doseq=True)
+        except:
+            url = self._host + query._getPath() + "?" + urllib.parse.urlencode(allParams, doseq=True)
         return url
 
 
@@ -234,7 +237,7 @@ def jsonRequest(self, methodUrl, paramDict, customLogFName = None, allowUseOfArc
                 with open(customLogFName or self._requestLogFName, "a") as log:
                     if paramDict != None:
                         log.write("# " + json.dumps(paramDict) + "\n")
-                    log.write(methodUrl + "\n")
+                    log.write(methodUrl + "\n\n")
             except Exception as ex:
                 self._lastException = ex
 
@@ -292,6 +295,7 @@ def jsonRequest(self, methodUrl, paramDict, customLogFName = None, allowUseOfArc
                 # in case of invalid input parameters, don't try to repeat the search
                 if respInfo != None and respInfo.status_code == 530:
                     break
+                print("The request will be automatically repeated in 3 seconds...")
                 time.sleep(3)   # sleep for X seconds on error
         self._lock.release()
         if returnData == None:
@@ -327,9 +331,13 @@ def jsonRequestAnalytics(self, methodUrl, paramDict):
                 break
             except Exception as ex:
                 self._lastException = ex
-                print("Event Registry exception while executing the request:")
+                print("Event Registry Analytics exception while executing the request:")
                 self.printLastException()
-                break
+                # in case of invalid input parameters, don't try to repeat the action
+                if respInfo != None and respInfo.status_code == 530:
+                    print("The request will not be repeated since we received a response code 530")
+                    break
+                print("The request will be automatically repeated in 3 seconds...")
                 time.sleep(3)   # sleep for X seconds on error
         self._lock.release()
         if returnData == None:
 
@@ -37,6 +37,8 @@ def __init__(self,
                 eventFilter = "keepAll",
                 startSourceRankPercentile = 0,
                 endSourceRankPercentile = 100,
+                minSentiment = -1,
+                maxSentiment = 1,
                 dataType = "news",
                 requestedResult = None):
         """
@@ -103,6 +105,10 @@ def __init__(self,
                 "keepAll" (no filtering, default)
         @param startSourceRankPercentile: starting percentile of the sources to consider in the results (default: 0). Value should be in range 0-90 and divisible by 10.
         @param endSourceRankPercentile: ending percentile of the sources to consider in the results (default: 100). Value should be in range 10-100 and divisible by 10.
+        @param minSentiment: minimum value of the sentiment, that the returned articles should have. Range [-1, 1]. Note: setting the value will remove all articles that don't have
+                a computed value for the sentiment (all non-English articles)
+        @param maxSentiment: maximum value of the sentiment, that the returned articles should have. Range [-1, 1]. Note: setting the value will remove all articles that don't have
+                a computed value for the sentiment (all non-English articles)
         @param dataType: what data types should we search? "news" (news content, default), "pr" (press releases), or "blog".
                 If you want to use multiple data types, put them in an array (e.g. ["news", "pr"])
         @param requestedResult: the information to return as the result of the query. By default return the list of matching articles
@@ -160,6 +166,12 @@ def __init__(self,
             self._setVal("startSourceRankPercentile", startSourceRankPercentile)
         if endSourceRankPercentile != 100:
             self._setVal("endSourceRankPercentile", endSourceRankPercentile)
+        if minSentiment != -1:
+            assert minSentiment >= -1 and minSentiment <= 1
+            self._setVal("minSentiment", minSentiment)
+        if maxSentiment != 1:
+            assert maxSentiment >= -1 and maxSentiment <= 1
+            self._setVal("maxSentiment", maxSentiment)
         # always set the data type
         self._setVal("dataType", dataType)
 
@@ -244,7 +256,7 @@ def count(self, eventRegistry):
     def execQuery(self, eventRegistry,
                   sortBy = "rel",
                   sortByAsc = False,
-                  returnInfo = ReturnInfo(),
+                  returnInfo = None,
                   maxItems = -1,
                   **kwargs):
         """
@@ -270,15 +282,11 @@ def execQuery(self, eventRegistry,
 
 
     @staticmethod
-    def initWithComplexQuery(query, dataType = "news"):
+    def initWithComplexQuery(query):
         """
         @param query: complex query as ComplexArticleQuery instance, string or a python dict
-        @param dataType: what data types should we search? "news" (news content, default), "pr" (press releases), or "blog".
-                If you want to use multiple data types, put them in an array (e.g. ["news", "pr"])
         """
         q = QueryArticlesIter()
-        # set data type
-        q._setVal("dataType", dataType)
 
         # provided an instance of ComplexArticleQuery
         if isinstance(query, ComplexArticleQuery):
@@ -360,7 +368,7 @@ def __init__(self,
                  page = 1,
                  count = 100,
                  sortBy = "date", sortByAsc = False,
-                 returnInfo = ReturnInfo()):
+                 returnInfo = None):
         """
         return article details for resulting articles
         @param page: page of the articles to return
@@ -376,7 +384,8 @@ def __init__(self,
         self.articlesCount = count
         self.articlesSortBy = sortBy
         self.articlesSortByAsc = sortByAsc
-        self.__dict__.update(returnInfo.getParams("articles"))
+        if returnInfo != None:
+            self.__dict__.update(returnInfo.getParams("articles"))
 
 
     def setPage(self, page):
@@ -597,7 +606,7 @@ def __init__(self,
                  updatesUntilTm = None,
                  updatesUntilMinsAgo = None,
                  mandatorySourceLocation = False,
-                 returnInfo = ReturnInfo()):
+                 returnInfo = None):
         """
         get the list of articles that were recently added to the Event Registry and match the selected criteria
         @param maxArticleCount: the maximum number of articles to return in the call (the number can be even higher than 100 but in case more articles
@@ -624,4 +633,5 @@ def __init__(self,
             self.recentActivityArticlesUpdatesUntilMinsAgo = updatesUntilMinsAgo
         self.recentActivityArticlesMaxArticleCount = maxArticleCount
         self.recentActivityArticlesMandatorySourceLocation = mandatorySourceLocation
-        self.__dict__.update(returnInfo.getParams("recentActivityArticles"))
+        if returnInfo != None:
+            self.__dict__.update(returnInfo.getParams("recentActivityArticles"))