Skip to content

Commit a66d9e7

Browse files
Changes to fix Enterprise UI filtering of Github Hosted Scanner Repositories to Include (#4430)
* Changes to fix Enterprise UI filtering of Github Hosted Scanner Repositories to Include * trying to pass linting isssues * trying to solve linting errors * Added upstream repo filtering, improvements to normalizeRepo, and unit tests for both. * Fixing tests impacted by PR 4426
1 parent 2114e77 commit a66d9e7

File tree

3 files changed

+112
-10
lines changed

3 files changed

+112
-10
lines changed

pkg/sources/github/github.go

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,11 @@ func (c *filteredRepoCache) includeRepo(s string) bool {
205205
return false
206206
}
207207

208+
// wantRepo returns true if the repository should be included based on include/exclude patterns
209+
func (c *filteredRepoCache) wantRepo(s string) bool {
210+
return !c.ignoreRepo(s) && c.includeRepo(s)
211+
}
212+
208213
// Init returns an initialized GitHub source.
209214
func (s *Source) Init(aCtx context.Context, name string, jobID sources.JobID, sourceID sources.SourceID, verify bool, connection *anypb.Any, concurrency int) error {
210215
err := git.CmdCheck()
@@ -433,14 +438,29 @@ func (s *Source) Enumerate(ctx context.Context, reporter sources.UnitReporter) e
433438
// Double make sure that all enumerated repositories in the
434439
// filteredRepoCache have an entry in the repoInfoCache.
435440
for _, repo := range s.filteredRepoCache.Values() {
436-
ctx := context.WithValue(ctx, "repo", repo)
441+
// Extract the repository name from the URL for filtering
442+
repoName := repo
443+
if strings.Contains(repo, "/") {
444+
// Try to extract org/repo name from URL
445+
if strings.Contains(repo, "github.com") {
446+
parts := strings.Split(repo, "/")
447+
if len(parts) >= 2 {
448+
repoName = parts[len(parts)-2] + "/" + strings.TrimSuffix(parts[len(parts)-1], ".git")
449+
}
450+
}
451+
}
437452

438-
repo, err := s.ensureRepoInfoCache(ctx, repo, &unitErrorReporter{reporter})
439-
if err != nil {
440-
ctx.Logger().Error(err, "error caching repo info")
441-
_ = dedupeReporter.UnitErr(ctx, fmt.Errorf("error caching repo info: %w", err))
453+
// Final filter check - only include repositories that pass the filter
454+
if s.filteredRepoCache.wantRepo(repoName) {
455+
ctx = context.WithValue(ctx, "repo", repo)
456+
457+
repo, err := s.ensureRepoInfoCache(ctx, repo, &unitErrorReporter{reporter})
458+
if err != nil {
459+
ctx.Logger().Error(err, "error caching repo info")
460+
_ = dedupeReporter.UnitErr(ctx, fmt.Errorf("error caching repo info: %w", err))
461+
}
462+
s.repos = append(s.repos, repo)
442463
}
443-
s.repos = append(s.repos, repo)
444464
}
445465
githubReposEnumerated.WithLabelValues(s.name).Set(float64(len(s.repos)))
446466
ctx.Logger().Info("Completed enumeration", "num_repos", len(s.repos), "num_orgs", s.orgsCache.Count(), "num_members", len(s.memberCache))

pkg/sources/github/github_test.go

Lines changed: 70 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -423,6 +423,41 @@ func TestNormalizeRepos(t *testing.T) {
423423
}
424424
}
425425

426+
func TestNormalizeRepo(t *testing.T) {
427+
// Test that normalizeRepo correctly identifies URLs with protocols
428+
source := &Source{}
429+
430+
// Test case 1: HTTP URL
431+
result, err := source.normalizeRepo("https://github.com/org/repo.git")
432+
assert.NoError(t, err)
433+
assert.Contains(t, result, "github.com/org/repo")
434+
435+
// Test case 2: HTTP URL without .git
436+
result, err = source.normalizeRepo("http://github.com/org/repo")
437+
assert.NoError(t, err)
438+
assert.Contains(t, result, "github.com/org/repo")
439+
440+
// Test case 3: Git protocol URL
441+
result, err = source.normalizeRepo("git://github.com/org/repo.git")
442+
assert.NoError(t, err)
443+
assert.Contains(t, result, "github.com/org/repo")
444+
445+
// Test case 4: SSH URL
446+
result, err = source.normalizeRepo("ssh://[email protected]/org/repo.git")
447+
assert.NoError(t, err)
448+
assert.Contains(t, result, "github.com/org/repo")
449+
450+
// Test case 5: Org/repo format (should convert to full URL)
451+
result, err = source.normalizeRepo("org/repo")
452+
assert.NoError(t, err)
453+
assert.Contains(t, result, "github.com/org/repo")
454+
455+
// Test case 6: Invalid format (no protocol, no slash)
456+
_, err = source.normalizeRepo("invalid")
457+
assert.Error(t, err)
458+
assert.Contains(t, err.Error(), "no repositories found")
459+
}
460+
426461
func TestHandleRateLimit(t *testing.T) {
427462
s := initTestSource(&sourcespb.GitHub{Credential: &sourcespb.GitHub_Unauthenticated{}})
428463
ctx := context.Background()
@@ -493,7 +528,7 @@ func TestEnumerateWithToken(t *testing.T) {
493528
JSON(map[string]string{"login": "super-secret-user"})
494529

495530
gock.New("https://api.github.com").
496-
Get("/users/super-secret-user/repos").
531+
Get("/user/repos").
497532
MatchParam("per_page", "100").
498533
Reply(200).
499534
JSON([]map[string]string{{"clone_url": "https://github.com/super-secret-user/super-secret-repo.git", "full_name": "super-secret-user/super-secret-repo"}})
@@ -574,7 +609,7 @@ func TestEnumerate(t *testing.T) {
574609

575610
//
576611
gock.New("https://api.github.com").
577-
Get("/users/super-secret-user/repos").
612+
Get("/user/repos").
578613
Reply(200).
579614
JSON(`[{"name": "super-secret-repo", "full_name": "super-secret-user/super-secret-repo", "owner": {"login": "super-secret-user"}, "clone_url": "https://github.com/super-secret-user/super-secret-repo.git", "has_wiki": false, "size": 1}]`)
580615

@@ -978,6 +1013,39 @@ func Test_ScanMultipleTargets_MultipleErrors(t *testing.T) {
9781013
}
9791014
}
9801015

1016+
func TestRepositoryFiltering(t *testing.T) {
1017+
// Test that the filteredRepoCache correctly filters repositories
1018+
source := &Source{}
1019+
1020+
// Test case 1: No filters specified (should include everything)
1021+
cache1 := source.newFilteredRepoCache(context.Background(), simple.NewCache[string](), []string{}, []string{})
1022+
assert.True(t, cache1.wantRepo("org/repo1"))
1023+
assert.True(t, cache1.wantRepo("org/repo2"))
1024+
assert.True(t, cache1.wantRepo("org/repo3"))
1025+
1026+
// Test case 2: Include filter specified (should only include matching repos)
1027+
cache2 := source.newFilteredRepoCache(context.Background(), simple.NewCache[string](), []string{"org/repo1", "org/repo2"}, []string{})
1028+
assert.True(t, cache2.wantRepo("org/repo1"))
1029+
assert.True(t, cache2.wantRepo("org/repo2"))
1030+
assert.False(t, cache2.wantRepo("org/repo3"))
1031+
1032+
// Test case 3: Exclude filter specified (should exclude matching repos)
1033+
cache3 := source.newFilteredRepoCache(context.Background(), simple.NewCache[string](), []string{}, []string{"org/repo1"})
1034+
assert.False(t, cache3.wantRepo("org/repo1"))
1035+
assert.True(t, cache3.wantRepo("org/repo2"))
1036+
assert.True(t, cache3.wantRepo("org/repo3"))
1037+
1038+
// Test case 4: Both include and exclude filters (exclude takes precedence)
1039+
cache4 := source.newFilteredRepoCache(context.Background(), simple.NewCache[string](), []string{"org/repo1"}, []string{"org/repo1"})
1040+
assert.False(t, cache4.wantRepo("org/repo1"))
1041+
1042+
// Test case 5: Wildcard patterns
1043+
cache5 := source.newFilteredRepoCache(context.Background(), simple.NewCache[string](), []string{"org/*"}, []string{})
1044+
assert.True(t, cache5.wantRepo("org/repo1"))
1045+
assert.True(t, cache5.wantRepo("org/repo2"))
1046+
assert.False(t, cache5.wantRepo("other/repo1"))
1047+
}
1048+
9811049
func noopReporter() sources.UnitReporter {
9821050
return sources.VisitorReporter{
9831051
VisitUnit: func(context.Context, sources.SourceUnit) error {

pkg/sources/github/repo.go

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"fmt"
66
"io"
77
"net/http"
8+
"regexp"
89
"strings"
910
"sync"
1011

@@ -273,6 +274,12 @@ func (s *Source) processRepos(ctx context.Context, target string, reporter sourc
273274
}
274275

275276
repoName, repoURL := r.GetFullName(), r.GetCloneURL()
277+
278+
// Check if we should process this repository based on the filter
279+
if !s.filteredRepoCache.wantRepo(repoName) {
280+
continue
281+
}
282+
276283
s.totalRepoSize += r.GetSize()
277284
s.filteredRepoCache.Set(repoName, repoURL)
278285
s.cacheRepoInfo(r)
@@ -349,10 +356,17 @@ func (s *Source) wikiIsReachable(ctx context.Context, repoURL string) bool {
349356

350357
// normalizeRepo normalizes a GitHub repository URL or name to its canonical form.
351358
func (s *Source) normalizeRepo(repo string) (string, error) {
352-
// if the string contains a '/', assume it's a GitHub repository.
353-
if strings.ContainsRune(repo, '/') {
359+
360+
// If it's a full URL (has protocol), normalize it
361+
if regexp.MustCompile(`^[a-z]+://`).MatchString(repo) {
362+
354363
return giturl.NormalizeGithubRepo(repo)
355364
}
365+
// If it's a repository name (contains / but not http), convert to full URL first
366+
if strings.Contains(repo, "/") && !regexp.MustCompile(`^[a-z]+://`).MatchString(repo) {
367+
fullURL := "https://github.com/" + repo
368+
return giturl.NormalizeGithubRepo(fullURL)
369+
}
356370

357371
return "", fmt.Errorf("no repositories found for %s", repo)
358372
}

0 commit comments

Comments
 (0)