From bb91ed7d71a859111023da09e47eb4559694cc5a Mon Sep 17 00:00:00 2001 From: Wyatt Dahlenburg Date: Wed, 10 Oct 2018 21:27:38 -0500 Subject: [PATCH] Scrape Everything From Github --- README.md | 2 ++ core/git.go | 12 ++++--- core/github.go | 88 ++++++++++++++++++++++++++++++++++++++++++++++++- core/options.go | 2 ++ main.go | 59 ++++++++++++++++++++++++++++++--- 5 files changed, 154 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index b093ae0d..80642fc2 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,8 @@ Gitrob is a tool to help find potentially sensitive files pushed to public repos Suppress all output except for errors -threads int Number of concurrent threads (default number of logical CPUs) +-gather-all + Specify whether to pull all repositories from the domain ``` ### Saving session to a file diff --git a/core/git.go b/core/git.go index f5abbc5c..49f12a79 100644 --- a/core/git.go +++ b/core/git.go @@ -3,7 +3,6 @@ package core import ( "fmt" "io/ioutil" - "gopkg.in/src-d/go-git.v4" "gopkg.in/src-d/go-git.v4/plumbing" "gopkg.in/src-d/go-git.v4/plumbing/object" @@ -14,9 +13,14 @@ const ( EmptyTreeCommitId = "4b825dc642cb6eb9a060e54bf8d69288fbee4904" ) -func CloneRepository(url *string, branch *string, depth int) (*git.Repository, string, error) { - urlVal := *url - branchVal := *branch +func CloneRepository(repo *GithubRepository, depth int) (*git.Repository, string, error) { + var urlVal string + if repo.CloneURL != nil { + urlVal = *repo.CloneURL + } else { + urlVal = *repo.URL + } + branchVal := *repo.DefaultBranch dir, err := ioutil.TempDir("", "gitrob") if err != nil { return nil, "", err diff --git a/core/github.go b/core/github.go index a1941701..1b9998c2 100644 --- a/core/github.go +++ b/core/github.go @@ -2,7 +2,6 @@ package core import ( "context" - "github.com/google/go-github/github" ) @@ -111,3 +110,90 @@ func GetOrganizationMembers(login *string, client *github.Client) ([]*GithubOwne } return allMembers, nil } + +func DetermineRepositoryCount(client *github.Client) (int64, error){ + ctx := context.Background() + opt := &github.RepositoryListAllOptions{ + Since: 0, + } + + sinceValue := 0 + lastValue := 0 + + for { + repos, _, err := client.Repositories.ListAll(ctx, opt) + if err != nil { + return -1, err + } + for _, repo := range repos { + if !*repo.Fork { + sinceValue = int(*repo.ID) + } + } + if len(repos) == 0 { + if sinceValue == lastValue { + return int64(sinceValue), nil + } + sinceValue = (lastValue + sinceValue) / 2 + } else { + lastValue = sinceValue + sinceValue *= 2 + } + + + opt = &github.RepositoryListAllOptions{ + Since: int64(sinceValue), + } + } + return 0, nil +} + +func GetAllRepositories(client *github.Client, start int64, end int64) ([]*GithubRepository, error) { + var allRepos []*GithubRepository + ctx := context.Background() + opt := &github.RepositoryListAllOptions{ + Since: start, + } + + hard_coded_branch := "master" + + scraped := false + sinceValue := start + + for scraped != true { + repos, _, err := client.Repositories.ListAll(ctx, opt) + if err != nil { + return allRepos, err + } + for _, repo := range repos { + if !*repo.Fork { + r := GithubRepository{ + Owner: repo.Owner.Login, + ID: repo.ID, + Name: repo.Name, + FullName: repo.FullName, + CloneURL: repo.CloneURL, + URL: repo.HTMLURL, + DefaultBranch: &hard_coded_branch, + Description: repo.Description, + Homepage: repo.Homepage, + } + allRepos = append(allRepos, &r) + + sinceValue = int64(*r.ID) + + if sinceValue >= end { + return allRepos, nil + } + } + } + if len(repos) == 0 { + scraped = true + } + opt = &github.RepositoryListAllOptions{ + Since: int64(sinceValue), + } + } + + return allRepos, nil +} diff --git a/core/options.go b/core/options.go index cf610816..dffb005d 100644 --- a/core/options.go +++ b/core/options.go @@ -8,6 +8,7 @@ type Options struct { CommitDepth *int GithubAccessToken *string `json:"-"` NoExpandOrgs *bool + GatherAll *bool Threads *int Save *string `json:"-"` Load *string `json:"-"` @@ -23,6 +24,7 @@ func ParseOptions() (Options, error) { CommitDepth: flag.Int("commit-depth", 500, "Number of repository commits to process"), GithubAccessToken: flag.String("github-access-token", "", "GitHub access token to use for API requests"), NoExpandOrgs: flag.Bool("no-expand-orgs", false, "Don't add members to targets when processing organizations"), + GatherAll: flag.Bool("gather-all", false, "Gather all repositories on the domain"), Threads: flag.Int("threads", 0, "Number of concurrent threads (default number of logical CPUs)"), Save: flag.String("save", "", "Save session to file"), Load: flag.String("load", "", "Load session file"), diff --git a/main.go b/main.go index a693ad89..fc84b262 100644 --- a/main.go +++ b/main.go @@ -6,7 +6,7 @@ import ( "strings" "sync" "time" - + "math" "github.com/michenriksen/gitrob/core" ) @@ -86,6 +86,52 @@ func GatherRepositories(sess *core.Session) { wg.Wait() } +func GatherReposConcurrent(sess *core.Session, thread_num int, start int64, end int64, wg *sync.WaitGroup) { + go func() { + sess.Out.Debug(" Thread [%d] for repository gathering: [%d:%d]\n", thread_num, start, end) + repos, err := core.GetAllRepositories(sess.GithubClient, start, end) + if err != nil { + sess.Out.Error(" Failed to retrieve all repositories %s\n", err) + } + + for _, repo := range repos { + sess.Out.Debug(" Retrieved repository: %s\n", *repo.FullName) + sess.AddRepository(repo) + } + + sess.Out.Info(" Thread [%d] Retrieved %d %s\n", thread_num, len(repos), core.Pluralize(len(repos), "repository", "repositories")) + wg.Done() + }() +} + +func GatherAllRepositories(sess *core.Session) { + var wg sync.WaitGroup + var threadNum int + + count, err := core.DetermineRepositoryCount(sess.GithubClient) + if err != nil { + sess.Out.Error( "Failed to find upper limit on repositories. Setting threads to 1") + threadNum = 1 + count = math.MaxInt64 + } else { + threadNum = *sess.Options.Threads + } + + sess.Out.Debug("Threads for repository gathering: %d\n", threadNum) + + bounds := int(count) / threadNum + + wg.Add(threadNum) + for i := 0; i < threadNum; i++ { + end := int64((i + 1) * bounds) + start := int64(end - int64(bounds)) + GatherReposConcurrent(sess, i, start, end, &wg) + } + + wg.Wait() + sess.Out.Info("Finished Pulling All Repos\n") +} + func AnalyzeRepositories(sess *core.Session) { sess.Stats.Status = core.StatusAnalyzing var ch = make(chan *core.GithubRepository, len(sess.Repositories)) @@ -115,7 +161,7 @@ func AnalyzeRepositories(sess *core.Session) { } sess.Out.Debug("[THREAD #%d][%s] Cloning repository...\n", tid, *repo.FullName) - clone, path, err := core.CloneRepository(repo.CloneURL, repo.DefaultBranch, *sess.Options.CommitDepth) + clone, path, err := core.CloneRepository(repo, *sess.Options.CommitDepth) if err != nil { if err.Error() != "remote repository is empty" { sess.Out.Error("Error cloning repository %s: %s\n", *repo.FullName, err) @@ -223,12 +269,17 @@ func main() { if sess.Stats.Status == "finished" { sess.Out.Important("Loaded session file: %s\n", *sess.Options.Load) } else { - if len(sess.Options.Logins) == 0 { + if len(sess.Options.Logins) == 0 && !*sess.Options.GatherAll { sess.Out.Fatal("Please provide at least one GitHub organization or user\n") - } + } GatherTargets(sess) GatherRepositories(sess) + + if *sess.Options.GatherAll { + GatherAllRepositories(sess) + } + AnalyzeRepositories(sess) sess.Finish()