diff --git a/diffbot.py b/diffbot.py
index e433eab..4c508ec 100644
--- a/diffbot.py
+++ b/diffbot.py
@@ -1,4 +1,4 @@
-"""Diffbot API wrapper."""
+"""Diffbot API wrapper. Edited Lee H"""
 import argparse
 import json
 import os
@@ -30,10 +30,13 @@ def __init__(self, token, version=API_VERSION):
         self._version = version
 
     @staticmethod
-    def _get(url, params=None):
+    def _get(url, headers=None, params=None):
         """HTTP GET request."""
         try:
-            response = requests.get(url, params=params)
+            if headers:
+                response = requests.get(url, headers=headers, params=params)
+            else:
+                response = requests.get(url, params=params)
             response.raise_for_status()
             # If JSON fails, return raw data
             # (e.g. when downloading CSV job logs).
@@ -46,24 +49,24 @@ def _get(url, params=None):
             return json.loads(urllib2.urlopen(url).read().decode(ENCODING))
 
     @staticmethod
-    def _post(url, data, content_type, params=None):
+    def _post(url, data, headers=None, params=None):
         """HTTP POST request."""
         try:
-            response = requests.post(url, params=params, data=data, headers={
-                'Content-Type': content_type,
-            })
+            response = requests.post(url, params=params, data=data, headers=headers)
             response.raise_for_status()
             return response.json()
         except NameError:
             url = '{0}?{1}'.format(url, urllib.urlencode(params))
-            req = urllib2.Request(url, data.encode(ENCODING), {
-                'Content-Type': content_type,
-            })
+            req = urllib2.Request(url, data.encode(ENCODING), headers or {})
             return json.loads(urllib2.urlopen(req).read().decode(ENCODING))
 
-    def endpoint(self, name):
+    def endpoint(self, name, no_render=False):
         """Generate the URL endpoint for the given API."""
-        return '{0}/v{1}/{2}'.format(API_ROOT, self._version, name)
+        endpoint_url = '{0}/v{1}/{2}'.format(API_ROOT, self._version, name)
+        if no_render:
+            # Turn off js to speed up processing
+            endpoint_url += '?norender'
+        return endpoint_url
 
     def api(self, name, url, **kwargs):
         """Generic API method."""
@@ -74,6 +77,7 @@ def api(self, name, url, **kwargs):
         timeout = kwargs.get('timeout')
         text = kwargs.get('text')
         html = kwargs.get('html')
+        headers = kwargs.get('headers')
         if text and html:
             raise ValueError(u'Both `text` and `html` arguments provided!')
         params = {'url': url, 'token': self._token}
@@ -86,8 +90,12 @@ def api(self, name, url, **kwargs):
         url = self.endpoint(name)
         if text or html:
             content_type = html and 'text/html' or 'text/plain'
-            return self._post(url, text or html, content_type, params=params)
-        return self._get(url, params=params)
+            headers_cust = {'Content-Type': content_type}
+            if headers:
+                headers_cust = headers.copy()
+                headers_cust['Content-Type'] = content_type
+            return self._post(url, text or html, headers=headers_cust, params=params)
+        return self._get(url, headers=headers, params=params)
 
     def article(self, url, **kwargs):
         """Article API."""
@@ -122,13 +130,11 @@ def crawl(self, urls, name='crawl', api='analyze', **kwargs):
         if isinstance(urls, list):
             urls = ' '.join(urls)
         url = self.endpoint('crawl')
-        process_url = self.endpoint(api)
-        params = {
-            'token': self._token,
-            'seeds': urls,
-            'name': name,
-            'apiUrl': process_url,
-        }
+        process_url = self.endpoint(api, no_render=kwargs.get('no_render', False))
+        params = {'token': self._token,
+                  'seeds': urls,
+                  'name': name,
+                  'apiUrl': process_url,}
 
         # Add any additional named parameters as accepted by Crawlbot
         params['maxToCrawl'] = 10
@@ -153,7 +159,7 @@ def __init__(self, token, name, version=API_VERSION):
 
     def control(self, **kwargs):
         params = {'token': self._token, 'name': self._name}
         params.update(kwargs)
-        res = self._get(self._url, params)
+        res = self._get(self._url, params=params)
         job = next(j for j in res['jobs'] if j['name'] == self._name)
         return job