From cf373db4055fcae1a157b93bf4ed64b5eefc0110 Mon Sep 17 00:00:00 2001 From: fpghost Date: Wed, 18 Apr 2018 12:52:05 +0100 Subject: [PATCH 1/5] init commit --- diffbot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/diffbot.py b/diffbot.py index e433eab..da41ce8 100644 --- a/diffbot.py +++ b/diffbot.py @@ -1,4 +1,4 @@ -"""Diffbot API wrapper.""" +"""Diffbot API wrapper. Edited Lee H""" import argparse import json import os From 01fc68d3e701c62821999e39db73a107f3c065a0 Mon Sep 17 00:00:00 2001 From: fpghost Date: Wed, 18 Apr 2018 13:22:34 +0100 Subject: [PATCH 2/5] add ability to add custom headers --- diffbot.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/diffbot.py b/diffbot.py index da41ce8..d78aa96 100644 --- a/diffbot.py +++ b/diffbot.py @@ -30,10 +30,10 @@ def __init__(self, token, version=API_VERSION): self._version = version @staticmethod - def _get(url, params=None): + def _get(url, headers=None, params=None): """HTTP GET request.""" try: - response = requests.get(url, params=params) + response = requests.get(url, headers=headers, params=params) response.raise_for_status() # If JSON fails, return raw data # (e.g. when downloading CSV job logs). 
@@ -46,19 +46,15 @@ def _get(url, params=None): return json.loads(urllib2.urlopen(url).read().decode(ENCODING)) @staticmethod - def _post(url, data, content_type, params=None): + def _post(url, data, headers=None, params=None): """HTTP POST request.""" try: - response = requests.post(url, params=params, data=data, headers={ - 'Content-Type': content_type, - }) + response = requests.post(url, params=params, data=data, headers=headers) response.raise_for_status() return response.json() except NameError: url = '{0}?{1}'.format(url, urllib.urlencode(params)) - req = urllib2.Request(url, data.encode(ENCODING), { - 'Content-Type': content_type, - }) + req = urllib2.Request(url, data.encode(ENCODING), headers) return json.loads(urllib2.urlopen(req).read().decode(ENCODING)) def endpoint(self, name): @@ -74,6 +70,7 @@ def api(self, name, url, **kwargs): timeout = kwargs.get('timeout') text = kwargs.get('text') html = kwargs.get('html') + headers = kwargs.get('headers') if text and html: raise ValueError(u'Both `text` and `html` arguments provided!') params = {'url': url, 'token': self._token} @@ -86,8 +83,12 @@ def api(self, name, url, **kwargs): url = self.endpoint(name) if text or html: content_type = html and 'text/html' or 'text/plain' - return self._post(url, text or html, content_type, params=params) - return self._get(url, params=params) + headers_cust['Content-Type'] = content_type + if headers: + headers_cust = headers.copy() + headers_cust['Content-Type'] = content_type + return self._post(url, text or html, headers=headers_cust, params=params) + return self._get(url, headers=headers, params=params) def article(self, url, **kwargs): """Article API.""" From 30335083cdd13e8462f2d45036fd7802f7eb082a Mon Sep 17 00:00:00 2001 From: lee Date: Mon, 29 Jul 2019 12:43:41 +0200 Subject: [PATCH 3/5] allow no render param on the process url fix headers issue --- diffbot.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/diffbot.py 
b/diffbot.py index d78aa96..32e8486 100644 --- a/diffbot.py +++ b/diffbot.py @@ -57,9 +57,13 @@ def _post(url, data, headers=None, params=None): req = urllib2.Request(url, data.encode(ENCODING), headers) return json.loads(urllib2.urlopen(req).read().decode(ENCODING)) - def endpoint(self, name): + def endpoint(self, name, no_render=False): """Generate the URL endpoint for the given API.""" - return '{0}/v{1}/{2}'.format(API_ROOT, self._version, name) + endpoint_url = '{0}/v{1}/{2}'.format(API_ROOT, self._version, name) + if no_render: + # Turn off js to speed up processing + endpoint_url += '?norender' + return endpoint_url def api(self, name, url, **kwargs): """Generic API method.""" @@ -83,7 +87,7 @@ def api(self, name, url, **kwargs): url = self.endpoint(name) if text or html: content_type = html and 'text/html' or 'text/plain' - headers_cust['Content-Type'] = content_type + headers_cust = {'Content-Type': content_type} if headers: headers_cust = headers.copy() headers_cust['Content-Type'] = content_type @@ -123,13 +127,11 @@ def crawl(self, urls, name='crawl', api='analyze', **kwargs): if isinstance(urls, list): urls = ' '.join(urls) url = self.endpoint('crawl') - process_url = self.endpoint(api) - params = { - 'token': self._token, - 'seeds': urls, - 'name': name, - 'apiUrl': process_url, - } + process_url = self.endpoint(api, no_render=kwargs.get('no_render', False)) + params = {'token': self._token, + 'seeds': urls, + 'name': name, + 'apiUrl': process_url,} # Add any additional named parameters as accepted by Crawlbot params['maxToCrawl'] = 10 From f75476f24d4a7d697069fea20d70a51fe18d351a Mon Sep 17 00:00:00 2001 From: lee Date: Mon, 29 Jul 2019 18:06:38 +0200 Subject: [PATCH 4/5] don't pass headers at all if none --- diffbot.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/diffbot.py b/diffbot.py index 32e8486..91e5bce 100644 --- a/diffbot.py +++ b/diffbot.py @@ -33,7 +33,10 @@ def __init__(self, token, version=API_VERSION): def 
_get(url, headers=None, params=None): """HTTP GET request.""" try: - response = requests.get(url, headers=headers, params=params) + if headers: + response = requests.get(url, headers=headers, params=params) + else: + response = requests.get(url, params=params) response.raise_for_status() # If JSON fails, return raw data # (e.g. when downloading CSV job logs). From 3ee8910dac6006b42be121e2e3c0de8deb0014e2 Mon Sep 17 00:00:00 2001 From: lee Date: Mon, 29 Jul 2019 18:42:33 +0200 Subject: [PATCH 5/5] params is kwarg now --- diffbot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/diffbot.py b/diffbot.py index 91e5bce..4c508ec 100644 --- a/diffbot.py +++ b/diffbot.py @@ -159,7 +159,7 @@ def __init__(self, token, name, version=API_VERSION): def control(self, **kwargs): params = {'token': self._token, 'name': self._name} params.update(kwargs) - res = self._get(self._url, params) + res = self._get(self._url, params=params) job = next(j for j in res['jobs'] if j['name'] == self._name) return job