From fbdda04ea23a2d7131c4f453780aac4593801958 Mon Sep 17 00:00:00 2001
From: Hugo van Kemenade <1324225+hugovk@users.noreply.github.com>
Date: Thu, 7 May 2026 22:37:00 +0300
Subject: [PATCH 1/2] Revert "gh-106693: Explicitly mark ob_sval as unsigned char to avoid UB (#106826)"

This reverts commit fbba343622c9e4a38c8ef0f0b0e311164394d76a.
---
 Include/cpython/bytesobject.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Include/cpython/bytesobject.h b/Include/cpython/bytesobject.h
index 550b5fcb952e68..85bc2b827df8fb 100644
--- a/Include/cpython/bytesobject.h
+++ b/Include/cpython/bytesobject.h
@@ -5,7 +5,7 @@
 typedef struct {
     PyObject_VAR_HEAD
     Py_DEPRECATED(3.11) Py_hash_t ob_shash;
-    unsigned char ob_sval[1];
+    char ob_sval[1];
 
     /* Invariants:
      *     ob_sval contains space for 'ob_size+1' elements.
@@ -20,7 +20,7 @@ PyAPI_FUNC(int) _PyBytes_Resize(PyObject **, Py_ssize_t);
 #define _PyBytes_CAST(op) \
     (assert(PyBytes_Check(op)), _Py_CAST(PyBytesObject*, op))
 
-static inline unsigned char* PyBytes_AS_STRING(PyObject *op)
+static inline char* PyBytes_AS_STRING(PyObject *op)
 {
     return _PyBytes_CAST(op)->ob_sval;
 }

From fdf75b30a354d65df10fa1addf91c5b24d421cd8 Mon Sep 17 00:00:00 2001
From: Hugo van Kemenade <1324225+hugovk@users.noreply.github.com>
Date: Thu, 7 May 2026 23:03:45 +0300
Subject: [PATCH 2/2] Revert "gh-79638: Treat an unreachable robots.txt as "disallow all" (GH-138555)"

This reverts commit 310fe88994249a5a02e20d1211b8fc067e34aa78.
---
 Lib/test/test_robotparser.py                  | 66 +++++--------------
 Lib/urllib/robotparser.py                     | 10 +--
 ...5-09-05-20-50-35.gh-issue-79638.Y-JfaH.rst |  2 -
 3 files changed, 17 insertions(+), 61 deletions(-)
 delete mode 100644 Misc/NEWS.d/next/Library/2025-09-05-20-50-35.gh-issue-79638.Y-JfaH.rst

diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py
index 65bfe815705e0a..3ea0ec66fbfbe9 100644
--- a/Lib/test/test_robotparser.py
+++ b/Lib/test/test_robotparser.py
@@ -646,23 +646,26 @@ def test_group_without_user_agent(self):
 )
 class BaseLocalNetworkTestCase:
 
-    @classmethod
-    def setUpClass(cls):
+    def setUp(self):
         # clear _opener global variable
-        cls.addClassCleanup(urllib.request.urlcleanup)
+        self.addCleanup(urllib.request.urlcleanup)
 
-        cls.server = HTTPServer((socket_helper.HOST, 0), cls.RobotHandler)
-        cls.addClassCleanup(cls.server.server_close)
+        self.server = HTTPServer((socket_helper.HOST, 0), self.RobotHandler)
 
-        t = threading.Thread(
+        self.t = threading.Thread(
             name='HTTPServer serving',
-            target=cls.server.serve_forever,
+            target=self.server.serve_forever,
             # Short poll interval to make the test finish quickly.
             # Time between requests is short enough that we won't wake
             # up spuriously too many times.
             kwargs={'poll_interval':0.01})
-        cls.enterClassContext(threading_helper.start_threads([t]))
-        cls.addClassCleanup(cls.server.shutdown)
+        self.t.daemon = True  # In case this function raises.
+        self.t.start()
+
+    def tearDown(self):
+        self.server.shutdown()
+        self.t.join()
+        self.server.server_close()
 
 
 SAMPLE_ROBOTS_TXT = b'''\
@@ -684,6 +687,7 @@ def do_GET(self):
         def log_message(self, format, *args):
             pass
 
+    @threading_helper.reap_threads
     def testRead(self):
         # Test that reading a weird robots.txt doesn't fail.
         addr = self.server.server_address
@@ -705,21 +709,17 @@ def testRead(self):
         self.assertFalse(parser.can_fetch(agent, url + '/%2F[spam]/path'))
 
 
-class HttpErrorsTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
+class PasswordProtectedSiteTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
     class RobotHandler(BaseHTTPRequestHandler):
 
         def do_GET(self):
-            self.send_error(self.server.return_code)
+            self.send_error(403, "Forbidden access")
 
         def log_message(self, format, *args):
             pass
 
-    def setUp(self):
-        # Make sure that a valid code is set in the test.
-        self.server.return_code = None
-
+    @threading_helper.reap_threads
     def testPasswordProtectedSite(self):
-        self.server.return_code = 403
         addr = self.server.server_address
         url = 'http://' + socket_helper.HOST + ':' + str(addr[1])
         robots_url = url + "/robots.txt"
@@ -727,40 +727,6 @@ def testPasswordProtectedSite(self):
         parser.set_url(url)
         parser.read()
         self.assertFalse(parser.can_fetch("*", robots_url))
-        self.assertFalse(parser.can_fetch("*", url + '/some/file.html'))
-
-    def testNotFound(self):
-        self.server.return_code = 404
-        addr = self.server.server_address
-        url = f'http://{socket_helper.HOST}:{addr[1]}'
-        robots_url = url + "/robots.txt"
-        parser = urllib.robotparser.RobotFileParser()
-        parser.set_url(url)
-        parser.read()
-        self.assertTrue(parser.can_fetch("*", robots_url))
-        self.assertTrue(parser.can_fetch("*", url + '/path/file.html'))
-
-    def testTeapot(self):
-        self.server.return_code = 418
-        addr = self.server.server_address
-        url = f'http://{socket_helper.HOST}:{addr[1]}'
-        robots_url = url + "/robots.txt"
-        parser = urllib.robotparser.RobotFileParser()
-        parser.set_url(url)
-        parser.read()
-        self.assertTrue(parser.can_fetch("*", robots_url))
-        self.assertTrue(parser.can_fetch("*", url + '/pot-1?milk-type=Cream'))
-
-    def testServiceUnavailable(self):
-        self.server.return_code = 503
-        addr = self.server.server_address
-        url = f'http://{socket_helper.HOST}:{addr[1]}'
-        robots_url = url + "/robots.txt"
-        parser = urllib.robotparser.RobotFileParser()
-        parser.set_url(url)
-        parser.read()
-        self.assertFalse(parser.can_fetch("*", robots_url))
-        self.assertFalse(parser.can_fetch("*", url + '/path/file.html'))
 
 
 @support.requires_working_socket()
diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py
index 0c3e5d92890935..e70eae80036784 100644
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -65,17 +65,9 @@ def read(self):
             f = urllib.request.urlopen(self.url)
         except urllib.error.HTTPError as err:
             if err.code in (401, 403):
-                # If access to robot.txt has the status Unauthorized/Forbidden,
-                # then most likely this applies to the entire site.
                 self.disallow_all = True
-            elif 400 <= err.code < 500:
-                # RFC 9309, Section 2.3.1.3: the crawler MAY access any
-                # resources on the server.
+            elif err.code >= 400 and err.code < 500:
                 self.allow_all = True
-            elif 500 <= err.code < 600:
-                # RFC 9309, Section 2.3.1.4: the crawler MUST assume
-                # complete disallow.
-                self.disallow_all = True
             err.close()
         else:
             raw = f.read()
diff --git a/Misc/NEWS.d/next/Library/2025-09-05-20-50-35.gh-issue-79638.Y-JfaH.rst b/Misc/NEWS.d/next/Library/2025-09-05-20-50-35.gh-issue-79638.Y-JfaH.rst
deleted file mode 100644
index bd9fff0bc2e31b..00000000000000
--- a/Misc/NEWS.d/next/Library/2025-09-05-20-50-35.gh-issue-79638.Y-JfaH.rst
+++ /dev/null
@@ -1,2 +0,0 @@
-Disallow all access in :mod:`urllib.robotparser` if the ``robots.txt`` file
-is unreachable due to server or network errors.