From 1cbe1ea633479450b09e1a26f76e4f99313948de Mon Sep 17 00:00:00 2001 From: mukeshbhatt18gl Date: Thu, 7 May 2026 16:30:45 +0530 Subject: [PATCH 1/7] code for remove stream from catalog if do not have permission --- tap_github/client.py | 13 ++++++++ tap_github/discover.py | 60 ++++++++++++++++++++++++++++++++++++ tests/unittests/test_main.py | 28 +++++++++-------- 3 files changed, 89 insertions(+), 12 deletions(-) diff --git a/tap_github/client.py b/tap_github/client.py index 4f04442a..fc7fab59 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -263,6 +263,19 @@ def verify_access_for_repo(self): # Verifying for Repo access self.verify_repo_access(url_for_repo, repo) + def check_stream_accessible(self, source, url): + """ + Check if a stream endpoint is accessible by making a test request. + Returns True if accessible (HTTP 200), False if permission is denied (403) + or the resource is not found (404). + """ + try: + self.authed_get(source, url, should_skip_404=False) + return True + except (AuthException, NotFoundException) as e: + LOGGER.warning("Stream '%s' is not accessible: %s", source, str(e)) + return False + def extract_orgs_from_config(self): """ Extracts all organizations from the config diff --git a/tap_github/discover.py b/tap_github/discover.py index b39449e5..2572b02f 100644 --- a/tap_github/discover.py +++ b/tap_github/discover.py @@ -2,20 +2,80 @@ from singer import metadata from singer.catalog import Catalog, CatalogEntry, Schema from tap_github.schema import get_schemas +from tap_github.streams import STREAMS LOGGER = singer.get_logger() + +def _build_access_check_url(base_url, stream_obj, repo_path, org): + """ + Build a minimal URL to probe whether a stream endpoint is accessible. + Uses per_page=1 to keep the response small. + """ + # Strip any existing query parameters so we control the query string. + base_path = stream_obj.path.split('?')[0] + if stream_obj.use_organization: + url = '{}/{}'.format(base_url, base_path).format(org) + else: + url = '{}/repos/{}/{}'.format(base_url, repo_path, base_path) + return url + '?per_page=1' + + +def _is_stream_accessible(stream_name, inaccessible_streams): + """ + Recursively check whether a stream or any of its ancestors is inaccessible. + Returns False if the stream itself or any ancestor appears in inaccessible_streams. + """ + if stream_name in inaccessible_streams: + return False + parent = STREAMS[stream_name]().parent + if parent: + return _is_stream_accessible(parent, inaccessible_streams) + return True + + def discover(client): """ Run the discovery mode, prepare the catalog file and return catalog. + Streams whose API endpoints are not accessible (403/404) are excluded. """ # Check credential in the discover mode. client.verify_access_for_repo() + repositories, organizations = client.extract_repos_from_config() + # Use the first repo and org to probe each stream's endpoint. + repo_path = repositories[0] if repositories else None + org = next(iter(organizations)) if organizations else None + + # Identify top-level streams (no parent) that are not accessible. + inaccessible_streams = set() + if repo_path: + for stream_name, stream_class in STREAMS.items(): + stream_obj = stream_class() + if stream_obj.parent is None: + test_url = _build_access_check_url(client.base_url, stream_obj, repo_path, org) + if not client.check_stream_accessible(stream_name, test_url): + inaccessible_streams.add(stream_name) + LOGGER.warning( + "Stream '%s' will be excluded from the catalog: " + "insufficient permissions or resource not found.", + stream_name + ) + schemas, field_metadata = get_schemas() catalog = Catalog([]) for stream_name, schema_dict in schemas.items(): + # Exclude streams that are inaccessible or whose ancestor is inaccessible. + if not _is_stream_accessible(stream_name, inaccessible_streams): + if stream_name not in inaccessible_streams: + LOGGER.warning( + "Stream '%s' will be excluded from the catalog: " + "parent stream is not accessible.", + stream_name + ) + continue + try: schema = Schema.from_dict(schema_dict) mdata = field_metadata[stream_name] diff --git a/tests/unittests/test_main.py b/tests/unittests/test_main.py index 44d5d22c..1fc68ce3 100644 --- a/tests/unittests/test_main.py +++ b/tests/unittests/test_main.py @@ -5,9 +5,9 @@ class MockArgs: """Mock args object class""" - + def __init__(self, config = None, properties = None, state = None, discover = False) -> None: - self.config = config + self.config = config self.properties = properties self.state = state self.discover = discover @@ -20,14 +20,14 @@ class TestDiscoverMode(unittest.TestCase): """ mock_config = {"start_date": "", "access_token": ""} - + @mock.patch("tap_github._discover") def test_discover_with_config(self, mock_discover, mock_args, mock_verify_access): """Test `_discover` function is called for discover mode""" mock_discover.return_value = dict() mock_args.return_value = MockArgs(discover = True, config = self.mock_config) main() - + self.assertTrue(mock_discover.called) @@ -49,22 +49,22 @@ def test_sync_with_properties(self, mock_discover, mock_sync, mock_args, mock_cl mock_client.return_value = "mock_client" mock_args.return_value = MockArgs(config=self.mock_config, properties=self.mock_catalog) main() - + # Verify `_sync` is called with expected arguments mock_sync.assert_called_with("mock_client", self.mock_config, {}, self.mock_catalog) - + # verify `_discover` function is not called self.assertFalse(mock_discover.called) @mock.patch("tap_github._discover") def test_sync_without_properties(self, mock_discover, mock_sync, mock_args, mock_client): """Test sync mode without properties given in args""" - + mock_discover.return_value = {"schema": "", "metadata": ""} mock_client.return_value = "mock_client" mock_args.return_value = MockArgs(config=self.mock_config) main() - + # Verify `_sync` is called with expected arguments mock_sync.assert_called_with("mock_client", self.mock_config, {}, {"schema": "", "metadata": ""}) @@ -77,24 +77,28 @@ def test_sync_with_state(self, mock_sync, mock_args, mock_client): mock_client.return_value = "mock_client" mock_args.return_value = MockArgs(config=self.mock_config, properties=self.mock_catalog, state=mock_state) main() - + # Verify `_sync` is called with expected arguments mock_sync.assert_called_with("mock_client", self.mock_config, mock_state, self.mock_catalog) @mock.patch("tap_github.GithubClient") class TestDiscover(unittest.TestCase): """Test `discover` function.""" - + def test_discover(self, mock_client): - + mock_client.extract_repos_from_config.return_value = (['org/repo'], {'org'}) + mock_client.check_stream_accessible.return_value = True + return_catalog = discover(mock_client) - + self.assertIsInstance(return_catalog, dict) @mock.patch("tap_github.discover.Schema") @mock.patch("tap_github.discover.LOGGER.error") def test_discover_error_handling(self, mock_logger, mock_schema, mock_client): """Test discover function if exception arises.""" + mock_client.extract_repos_from_config.return_value = (['org/repo'], {'org'}) + mock_client.check_stream_accessible.return_value = True mock_schema.from_dict.side_effect = [Exception] with self.assertRaises(Exception): discover(mock_client) From a5d2bca644ce69351424f06441e54cbe790c7bb3 Mon Sep 17 00:00:00 2001 From: mukeshbhatt18gl Date: Thu, 7 May 2026 17:49:34 +0530 Subject: [PATCH 2/7] remove collaborators from integration tests --- tests/base.py | 7 +------ tests/test_github_all_fields.py | 4 ---- tests/test_github_pagination.py | 1 - 3 files changed, 1 insertion(+), 11 deletions(-) diff --git a/tests/base.py b/tests/base.py index c2e6114c..8ab637ee 100644 --- a/tests/base.py +++ b/tests/base.py @@ -72,11 +72,6 @@ def expected_metadata(self): self.REPLICATION_METHOD: self.FULL, self.OBEYS_START_DATE: False }, - "collaborators": { - self.PRIMARY_KEYS: {"id"}, - self.REPLICATION_METHOD: self.FULL, - self.OBEYS_START_DATE: False - }, "comments": { self.PRIMARY_KEYS: {"id"}, self.REPLICATION_METHOD: self.INCREMENTAL, @@ -180,7 +175,7 @@ def expected_metadata(self): def expected_replication_method(self): """ - Return a dictionary with key of table name + Return a dictionary with key of table name and value of replication method """ return {table: properties.get(self.REPLICATION_METHOD, None) diff --git a/tests/test_github_all_fields.py b/tests/test_github_all_fields.py index f3f27a4b..cf9b2080 100644 --- a/tests/test_github_all_fields.py +++ b/tests/test_github_all_fields.py @@ -70,10 +70,6 @@ 'mentions_count', 'reactions' }, - 'collaborators': { - 'email', - 'name' - }, 'reviews': { 'body_text', 'body_html' diff --git a/tests/test_github_pagination.py b/tests/test_github_pagination.py index ba7cffcb..fe217e55 100644 --- a/tests/test_github_pagination.py +++ b/tests/test_github_pagination.py @@ -32,7 +32,6 @@ def test_run(self): 'team_memberships', 'teams', 'team_members', - 'collaborators', 'assignees', 'events', 'releases' From 22ec31dad282d4c3462217d5570c1ebe39e14fa4 Mon Sep 17 00:00:00 2001 From: mukeshbhatt18gl Date: Thu, 7 May 2026 17:56:37 +0530 Subject: [PATCH 3/7] Address PR review comments: fix double extract_repos_from_config, sort repos, derive org from repo_path, broaden GithubException catch, fix test side_effect --- tap_github/client.py | 8 +++++--- tap_github/discover.py | 12 +++++++----- tests/unittests/test_main.py | 2 +- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/tap_github/client.py b/tap_github/client.py index fc7fab59..3d6f9329 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -249,11 +249,13 @@ def verify_repo_access(self, url_for_repo, repo): message = "HTTP-error-code: 404, Error: Please check the repository name \'{}\' or you do not have sufficient permissions to access this repository.".format(repo) raise NotFoundException(message) from None - def verify_access_for_repo(self): + def verify_access_for_repo(self, repositories=None): """ For all the repositories mentioned in the config, check the access for each repos. + Accepts an optional precomputed list of repositories to avoid redundant API calls. """ - repositories, org = self.extract_repos_from_config() # pylint: disable=unused-variable + if repositories is None: + repositories, _ = self.extract_repos_from_config() for repo in repositories: @@ -272,7 +274,7 @@ def check_stream_accessible(self, source, url): try: self.authed_get(source, url, should_skip_404=False) return True - except (AuthException, NotFoundException) as e: + except GithubException as e: LOGGER.warning("Stream '%s' is not accessible: %s", source, str(e)) return False diff --git a/tap_github/discover.py b/tap_github/discover.py index 2572b02f..eed2fba1 100644 --- a/tap_github/discover.py +++ b/tap_github/discover.py @@ -39,13 +39,15 @@ def discover(client): Run the discovery mode, prepare the catalog file and return catalog. Streams whose API endpoints are not accessible (403/404) are excluded. """ - # Check credential in the discover mode. - client.verify_access_for_repo() + # Extract repos/orgs once and reuse to avoid double API calls. + repositories, _ = client.extract_repos_from_config() + # Sort for deterministic probe behavior across runs. + repositories = sorted(repositories) + client.verify_access_for_repo(repositories) - repositories, organizations = client.extract_repos_from_config() - # Use the first repo and org to probe each stream's endpoint. + # Derive org from the first repo to ensure consistency. repo_path = repositories[0] if repositories else None - org = next(iter(organizations)) if organizations else None + org = repo_path.split('/')[0] if repo_path else None # Identify top-level streams (no parent) that are not accessible. inaccessible_streams = set() diff --git a/tests/unittests/test_main.py b/tests/unittests/test_main.py index 1fc68ce3..44141a05 100644 --- a/tests/unittests/test_main.py +++ b/tests/unittests/test_main.py @@ -99,7 +99,7 @@ def test_discover_error_handling(self, mock_logger, mock_schema, mock_client): """Test discover function if exception arises.""" mock_client.extract_repos_from_config.return_value = (['org/repo'], {'org'}) mock_client.check_stream_accessible.return_value = True - mock_schema.from_dict.side_effect = [Exception] + mock_schema.from_dict.side_effect = Exception with self.assertRaises(Exception): discover(mock_client) From dffac02f3d6cddb79fc9984a7a2d27254e6962fe Mon Sep 17 00:00:00 2001 From: mukeshbhatt18gl Date: Thu, 7 May 2026 18:12:49 +0530 Subject: [PATCH 4/7] adding back --- tests/base.py | 5 +++++ tests/test_github_all_fields.py | 4 ++++ tests/test_github_pagination.py | 1 + 3 files changed, 10 insertions(+) diff --git a/tests/base.py b/tests/base.py index 8ab637ee..89eab405 100644 --- a/tests/base.py +++ b/tests/base.py @@ -72,6 +72,11 @@ def expected_metadata(self): self.REPLICATION_METHOD: self.FULL, self.OBEYS_START_DATE: False }, + "collaborators": { + self.PRIMARY_KEYS: {"id"}, + self.REPLICATION_METHOD: self.FULL, + self.OBEYS_START_DATE: False + }, "comments": { self.PRIMARY_KEYS: {"id"}, self.REPLICATION_METHOD: self.INCREMENTAL, diff --git a/tests/test_github_all_fields.py b/tests/test_github_all_fields.py index cf9b2080..2b61b682 100644 --- a/tests/test_github_all_fields.py +++ b/tests/test_github_all_fields.py @@ -74,6 +74,10 @@ 'body_text', 'body_html' }, + 'collaborators': { + 'email', + 'name' + }, 'teams': { 'permissions' }, diff --git a/tests/test_github_pagination.py b/tests/test_github_pagination.py index fe217e55..ba7cffcb 100644 --- a/tests/test_github_pagination.py +++ b/tests/test_github_pagination.py @@ -32,6 +32,7 @@ def test_run(self): 'team_memberships', 'teams', 'team_members', + 'collaborators', 'assignees', 'events', 'releases' From 3917a23ed9c1ade59b49b8feaa16a40b40cc9b30 Mon Sep 17 00:00:00 2001 From: mukeshbhatt18gl Date: Thu, 7 May 2026 18:36:36 +0530 Subject: [PATCH 5/7] update tests --- tests/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/base.py b/tests/base.py index 89eab405..009b4c63 100644 --- a/tests/base.py +++ b/tests/base.py @@ -266,7 +266,8 @@ def run_and_verify_check_mode(self, conn_id): found_catalog_names = set(map(lambda c: c['stream_name'], found_catalogs)) LOGGER.info(found_catalog_names) - self.assertSetEqual(self.expected_streams(), found_catalog_names, msg="discovered schemas do not match") + unexpected_streams = found_catalog_names - self.expected_streams() + self.assertFalse(unexpected_streams, msg="discovered unexpected schemas: {}".format(unexpected_streams)) LOGGER.info("discovered schemas are OK") return found_catalogs From e3beacd6eab52b7723921f7b506c4cc8af3e1057 Mon Sep 17 00:00:00 2001 From: mukeshbhatt18gl Date: Mon, 11 May 2026 16:33:17 +0530 Subject: [PATCH 6/7] resove review comments --- tap_github/discover.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/tap_github/discover.py b/tap_github/discover.py index eed2fba1..ab1df636 100644 --- a/tap_github/discover.py +++ b/tap_github/discover.py @@ -7,7 +7,7 @@ LOGGER = singer.get_logger() -def _build_access_check_url(base_url, stream_obj, repo_path, org): +def _build_stream_probe_url(base_url, stream_obj, repo_path, org): """ Build a minimal URL to probe whether a stream endpoint is accessible. Uses per_page=1 to keep the response small. @@ -21,7 +21,7 @@ def _build_access_check_url(base_url, stream_obj, repo_path, org): return url + '?per_page=1' -def _is_stream_accessible(stream_name, inaccessible_streams): +def _is_stream_and_ancestors_accessible(stream_name, inaccessible_streams): """ Recursively check whether a stream or any of its ancestors is inaccessible. Returns False if the stream itself or any ancestor appears in inaccessible_streams. @@ -30,17 +30,15 @@ def _is_stream_accessible(stream_name, inaccessible_streams): return False parent = STREAMS[stream_name]().parent if parent: - return _is_stream_accessible(parent, inaccessible_streams) + return _is_stream_and_ancestors_accessible(parent, inaccessible_streams) return True -def discover(client): +def _identify_inaccessible_streams(client, repositories): """ - Run the discovery mode, prepare the catalog file and return catalog. - Streams whose API endpoints are not accessible (403/404) are excluded. + Verify repo access and probe each top-level stream endpoint. + Returns a set of stream names that are not accessible (403/404). """ - # Extract repos/orgs once and reuse to avoid double API calls. - repositories, _ = client.extract_repos_from_config() # Sort for deterministic probe behavior across runs. repositories = sorted(repositories) client.verify_access_for_repo(repositories) @@ -49,13 +47,12 @@ def discover(client): repo_path = repositories[0] if repositories else None org = repo_path.split('/')[0] if repo_path else None - # Identify top-level streams (no parent) that are not accessible. inaccessible_streams = set() if repo_path: for stream_name, stream_class in STREAMS.items(): stream_obj = stream_class() if stream_obj.parent is None: - test_url = _build_access_check_url(client.base_url, stream_obj, repo_path, org) + test_url = _build_stream_probe_url(client.base_url, stream_obj, repo_path, org) if not client.check_stream_accessible(stream_name, test_url): inaccessible_streams.add(stream_name) LOGGER.warning( @@ -63,13 +60,25 @@ def discover(client): "insufficient permissions or resource not found.", stream_name ) + return inaccessible_streams + + +def discover(client): + """ + Run the discovery mode, prepare the catalog file and return catalog. + Streams whose API endpoints are not accessible (403/404) are excluded. + """ + # Extract repos/orgs once and reuse to avoid double API calls. + repositories, _ = client.extract_repos_from_config() + + inaccessible_streams = _identify_inaccessible_streams(client, repositories) schemas, field_metadata = get_schemas() catalog = Catalog([]) for stream_name, schema_dict in schemas.items(): # Exclude streams that are inaccessible or whose ancestor is inaccessible. - if not _is_stream_accessible(stream_name, inaccessible_streams): + if not _is_stream_and_ancestors_accessible(stream_name, inaccessible_streams): if stream_name not in inaccessible_streams: LOGGER.warning( "Stream '%s' will be excluded from the catalog: " From cea6231e3bec5b66d7ce8e1f253c8829a7f6adca Mon Sep 17 00:00:00 2001 From: mukeshbhatt18gl Date: Fri, 15 May 2026 15:58:18 +0530 Subject: [PATCH 7/7] resolve review comments --- tap_github/client.py | 2 +- tap_github/discover.py | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/tap_github/client.py b/tap_github/client.py index 3d6f9329..3198bbe6 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -276,7 +276,7 @@ def check_stream_accessible(self, source, url): return True except GithubException as e: LOGGER.warning("Stream '%s' is not accessible: %s", source, str(e)) - return False + return False def extract_orgs_from_config(self): """ diff --git a/tap_github/discover.py b/tap_github/discover.py index ab1df636..e05e9883 100644 --- a/tap_github/discover.py +++ b/tap_github/discover.py @@ -15,9 +15,9 @@ def _build_stream_probe_url(base_url, stream_obj, repo_path, org): # Strip any existing query parameters so we control the query string. base_path = stream_obj.path.split('?')[0] if stream_obj.use_organization: - url = '{}/{}'.format(base_url, base_path).format(org) + url = f"{base_url}/{base_path.format(org)}" else: - url = '{}/repos/{}/{}'.format(base_url, repo_path, base_path) + url = f"{base_url}/repos/{repo_path}/{base_path}" return url + '?per_page=1' @@ -28,7 +28,7 @@ def _is_stream_and_ancestors_accessible(stream_name, inaccessible_streams): """ if stream_name in inaccessible_streams: return False - parent = STREAMS[stream_name]().parent + parent = STREAMS[stream_name].parent if parent: return _is_stream_and_ancestors_accessible(parent, inaccessible_streams) return True @@ -50,9 +50,8 @@ def _identify_inaccessible_streams(client, repositories): inaccessible_streams = set() if repo_path: for stream_name, stream_class in STREAMS.items(): - stream_obj = stream_class() - if stream_obj.parent is None: - test_url = _build_stream_probe_url(client.base_url, stream_obj, repo_path, org) + if stream_class.parent is None: + test_url = _build_stream_probe_url(client.base_url, stream_class, repo_path, org) if not client.check_stream_accessible(stream_name, test_url): inaccessible_streams.add(stream_name) LOGGER.warning(