Include/cpython/bytesobject.h (4 changes: 2 additions & 2 deletions)

@@ -5,7 +5,7 @@
 typedef struct {
     PyObject_VAR_HEAD
     Py_DEPRECATED(3.11) Py_hash_t ob_shash;
-    unsigned char ob_sval[1];
+    char ob_sval[1];
 
     /* Invariants:
      * ob_sval contains space for 'ob_size+1' elements.
@@ -20,7 +20,7 @@ PyAPI_FUNC(int) _PyBytes_Resize(PyObject **, Py_ssize_t);
 #define _PyBytes_CAST(op) \
     (assert(PyBytes_Check(op)), _Py_CAST(PyBytesObject*, op))
 
-static inline unsigned char* PyBytes_AS_STRING(PyObject *op)
+static inline char* PyBytes_AS_STRING(PyObject *op)
 {
     return _PyBytes_CAST(op)->ob_sval;
 }
Lib/test/test_robotparser.py (66 changes: 16 additions & 50 deletions)

@@ -646,23 +646,26 @@ def test_group_without_user_agent(self):
 )
 class BaseLocalNetworkTestCase:
 
-    @classmethod
-    def setUpClass(cls):
+    def setUp(self):
         # clear _opener global variable
-        cls.addClassCleanup(urllib.request.urlcleanup)
+        self.addCleanup(urllib.request.urlcleanup)
 
-        cls.server = HTTPServer((socket_helper.HOST, 0), cls.RobotHandler)
-        cls.addClassCleanup(cls.server.server_close)
+        self.server = HTTPServer((socket_helper.HOST, 0), self.RobotHandler)
 
-        t = threading.Thread(
+        self.t = threading.Thread(
             name='HTTPServer serving',
-            target=cls.server.serve_forever,
+            target=self.server.serve_forever,
             # Short poll interval to make the test finish quickly.
             # Time between requests is short enough that we won't wake
             # up spuriously too many times.
             kwargs={'poll_interval':0.01})
-        cls.enterClassContext(threading_helper.start_threads([t]))
-        cls.addClassCleanup(cls.server.shutdown)
+        self.t.daemon = True  # In case this function raises.
+        self.t.start()
+
+    def tearDown(self):
+        self.server.shutdown()
+        self.t.join()
+        self.server.server_close()
 
 
 SAMPLE_ROBOTS_TXT = b'''\
@@ -684,6 +687,7 @@ def do_GET(self):
         def log_message(self, format, *args):
             pass
 
+    @threading_helper.reap_threads
     def testRead(self):
         # Test that reading a weird robots.txt doesn't fail.
         addr = self.server.server_address
@@ -705,62 +709,24 @@ def testRead(self):
         self.assertFalse(parser.can_fetch(agent, url + '/%2F[spam]/path'))
 
 
-class HttpErrorsTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
+class PasswordProtectedSiteTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
     class RobotHandler(BaseHTTPRequestHandler):
 
         def do_GET(self):
-            self.send_error(self.server.return_code)
+            self.send_error(403, "Forbidden access")
 
         def log_message(self, format, *args):
             pass
 
-    def setUp(self):
-        # Make sure that a valid code is set in the test.
-        self.server.return_code = None
-
+    @threading_helper.reap_threads
     def testPasswordProtectedSite(self):
-        self.server.return_code = 403
         addr = self.server.server_address
         url = 'http://' + socket_helper.HOST + ':' + str(addr[1])
         robots_url = url + "/robots.txt"
        parser = urllib.robotparser.RobotFileParser()
         parser.set_url(url)
         parser.read()
         self.assertFalse(parser.can_fetch("*", robots_url))
-        self.assertFalse(parser.can_fetch("*", url + '/some/file.html'))
-
-    def testNotFound(self):
-        self.server.return_code = 404
-        addr = self.server.server_address
-        url = f'http://{socket_helper.HOST}:{addr[1]}'
-        robots_url = url + "/robots.txt"
-        parser = urllib.robotparser.RobotFileParser()
-        parser.set_url(url)
-        parser.read()
-        self.assertTrue(parser.can_fetch("*", robots_url))
-        self.assertTrue(parser.can_fetch("*", url + '/path/file.html'))
-
-    def testTeapot(self):
-        self.server.return_code = 418
-        addr = self.server.server_address
-        url = f'http://{socket_helper.HOST}:{addr[1]}'
-        robots_url = url + "/robots.txt"
-        parser = urllib.robotparser.RobotFileParser()
-        parser.set_url(url)
-        parser.read()
-        self.assertTrue(parser.can_fetch("*", robots_url))
-        self.assertTrue(parser.can_fetch("*", url + '/pot-1?milk-type=Cream'))
-
-    def testServiceUnavailable(self):
-        self.server.return_code = 503
-        addr = self.server.server_address
-        url = f'http://{socket_helper.HOST}:{addr[1]}'
-        robots_url = url + "/robots.txt"
-        parser = urllib.robotparser.RobotFileParser()
-        parser.set_url(url)
-        parser.read()
-        self.assertFalse(parser.can_fetch("*", robots_url))
-        self.assertFalse(parser.can_fetch("*", url + '/path/file.html'))
 
 
 @support.requires_working_socket()
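The lifecycle this diff restores (an ephemeral-port HTTPServer started on a daemon thread in setUp, then shut down, joined, and closed in tearDown) is easy to try outside the CPython test suite. The sketch below is a minimal, self-contained rendering of that pattern under stated assumptions: plain stdlib replaces CPython's internal test.support helpers, "127.0.0.1" stands in for socket_helper.HOST, the reap_threads decorator and the urlcleanup step are omitted, and ForbiddenSiteTestCase is a hypothetical name. It illustrates the pattern; it is not code from this PR.

import threading
import unittest
import urllib.robotparser
from http.server import BaseHTTPRequestHandler, HTTPServer


class ForbiddenSiteTestCase(unittest.TestCase):
    class RobotHandler(BaseHTTPRequestHandler):
        def do_GET(self):
            # Answer every request, including /robots.txt, with 403.
            self.send_error(403, "Forbidden access")

        def log_message(self, format, *args):
            pass  # keep the test output quiet

    def setUp(self):
        # Port 0 lets the OS pick a free ephemeral port.
        self.server = HTTPServer(("127.0.0.1", 0), self.RobotHandler)
        self.t = threading.Thread(
            target=self.server.serve_forever,
            kwargs={"poll_interval": 0.01})
        self.t.daemon = True  # in case setUp raises before tearDown runs
        self.t.start()

    def tearDown(self):
        self.server.shutdown()
        self.t.join()
        self.server.server_close()

    def testPasswordProtectedSite(self):
        host, port = self.server.server_address
        url = f"http://{host}:{port}"
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(url + "/robots.txt")
        parser.read()
        # 401/403 on robots.txt means "assume the whole site is disallowed".
        self.assertFalse(parser.can_fetch("*", url + "/robots.txt"))


if __name__ == "__main__":
    unittest.main()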
Lib/urllib/robotparser.py (10 changes: 1 addition & 9 deletions)

@@ -65,17 +65,9 @@ def read(self):
             f = urllib.request.urlopen(self.url)
         except urllib.error.HTTPError as err:
             if err.code in (401, 403):
-                # If access to robot.txt has the status Unauthorized/Forbidden,
-                # then most likely this applies to the entire site.
                 self.disallow_all = True
-            elif 400 <= err.code < 500:
-                # RFC 9309, Section 2.3.1.3: the crawler MAY access any
-                # resources on the server.
+            elif err.code >= 400 and err.code < 500:
                 self.allow_all = True
-            elif 500 <= err.code < 600:
-                # RFC 9309, Section 2.3.1.4: the crawler MUST assume
-                # complete disallow.
-                self.disallow_all = True
             err.close()
         else:
             raw = f.read()
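With the RFC 9309 branches removed, read() is back to the simpler mapping: 401/403 disallows everything, any other 4xx allows everything, and 5xx sets neither flag. The sketch below is a simplified restatement of that post-change control flow for illustration, not the module source verbatim; read_robots is a hypothetical free-function stand-in for the RobotFileParser.read() method.

import urllib.error
import urllib.request
import urllib.robotparser


def read_robots(parser: urllib.robotparser.RobotFileParser) -> None:
    # Simplified restatement of RobotFileParser.read() after this change.
    try:
        f = urllib.request.urlopen(parser.url)
    except urllib.error.HTTPError as err:
        if err.code in (401, 403):
            # Unauthorized/Forbidden: treat the whole site as disallowed.
            parser.disallow_all = True
        elif err.code >= 400 and err.code < 500:
            # Any other 4xx (404 and friends): treat the site as unrestricted.
            parser.allow_all = True
        # 5xx now falls through: neither flag is set, and a parser that has
        # never successfully read robots.txt keeps answering False from
        # can_fetch(), the conservative default.
        err.close()
    else:
        raw = f.read()
        parser.parse(raw.decode("utf-8").splitlines())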
