Skip to content

Commit 1c7fcd7

Browse files
authored
Merge pull request #58 from mcarmonaa/feature/discovery-skip-forked-repos
*: add flag to the cli to skip forked repositories on discovery
2 parents 9ecab19 + c72e1e1 commit 1c7fcd7

File tree

5 files changed

+73
-5
lines changed

5 files changed

+73
-5
lines changed

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ Rooted repositories have a few particularities that you should know to work with
3131

3232
The gitcollector entry point is used through the `download` subcommand (currently the only subcommand available):
3333

34-
```
34+
```txt
3535
Usage:
3636
gitcollector [OPTIONS] download [download-OPTIONS]
3737
@@ -43,7 +43,9 @@ Help Options:
4343
--bucket= library bucketization level (default: 2) [$GITCOLLECTOR_LIBRARY_BUCKET]
4444
--tmp= directory to place generated temporal files (default: /tmp) [$GITCOLLECTOR_TMP]
4545
--workers= number of workers, default to GOMAXPROCS [$GITCOLLECTOR_WORKERS]
46+
--half-cpu set the number of workers to half of the set workers [$GITCOLLECTOR_HALF_CPU]
4647
--no-updates don't allow updates on already downloaded repositories [$GITCOLLECTOR_NO_UPDATES]
48+
--no-forks github forked repositories will not be downloaded [$GITCOLLECTOR_NO_FORKS]
4749
--orgs= list of github organization names separated by comma [$GITHUB_ORGANIZATIONS]
4850
--token= github token [$GITHUB_TOKEN]
4951
--metrics-db= uri to a database where metrics will be sent [$GITCOLLECTOR_METRICS_DB_URI]

cmd/gitcollector/subcmd/download.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ type DownloadCmd struct {
3030
Workers int `long:"workers" description:"number of workers, default to GOMAXPROCS" env:"GITCOLLECTOR_WORKERS"`
3131
HalfCPU bool `long:"half-cpu" description:"set the number of workers to half of the set workers" env:"GITCOLLECTOR_HALF_CPU"`
3232
NotAllowUpdates bool `long:"no-updates" description:"don't allow updates on already downloaded repositories" env:"GITCOLLECTOR_NO_UPDATES"`
33+
NoForks bool `long:"no-forks" description:"github forked repositories will not be downloaded" env:"GITCOLLECTOR_NO_FORKS"`
3334
Orgs string `long:"orgs" env:"GITHUB_ORGANIZATIONS" description:"list of github organization names separated by comma" required:"true"`
3435
Token string `long:"token" env:"GITHUB_TOKEN" description:"github token"`
3536
MetricsDBURI string `long:"metrics-db" env:"GITCOLLECTOR_METRICS_DB_URI" description:"uri to a database where metrics will be sent"`
@@ -139,7 +140,7 @@ func (c *DownloadCmd) Execute(args []string) error {
139140
wp.Run()
140141
log.Debugf("worker pool is running")
141142

142-
go runGHOrgProviders(log.New(nil), orgs, c.Token, download)
143+
go runGHOrgProviders(log.New(nil), orgs, c.Token, download, c.NoForks)
143144

144145
wp.Wait()
145146
log.Debugf("worker pool stopped successfully")
@@ -183,6 +184,7 @@ func runGHOrgProviders(
183184
orgs []string,
184185
token string,
185186
download chan gitcollector.Job,
187+
skipForks bool,
186188
) {
187189
var wg sync.WaitGroup
188190
wg.Add(len(orgs))
@@ -196,7 +198,9 @@ func runGHOrgProviders(
196198
AuthToken: token,
197199
},
198200
),
199-
&discovery.GHProviderOpts{},
201+
&discovery.GHProviderOpts{
202+
SkipForks: skipForks,
203+
},
200204
)
201205

202206
go func() {

discovery/provider.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ var (
2828

2929
// GHProviderOpts represents configuration options for a GHProvider.
3030
type GHProviderOpts struct {
31+
SkipForks bool
3132
WaitNewRepos bool
3233
WaitOnRateLimit bool
3334
StopTimeout time.Duration
@@ -158,6 +159,10 @@ func (p *GHProvider) enqueueJob(ctx context.Context) error {
158159
return nil
159160
}
160161

162+
if p.opts.SkipForks && repo.GetFork() {
163+
return nil
164+
}
165+
161166
endpoint, err := getEndpoint(repo)
162167
if err != nil {
163168
return nil

discovery/provider_test.go

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
package discovery
22

33
import (
4+
"fmt"
5+
"os"
46
"strings"
57
"testing"
68
"time"
@@ -19,13 +21,14 @@ func TestGHProvider(t *testing.T) {
1921
timeToStop = 5 * time.Second
2022
)
2123

24+
token, _ := testToken()
2225
queue := make(chan gitcollector.Job, 50)
2326
provider := NewGHProvider(
2427
queue,
2528
NewGHOrgReposIter(org, &GHReposIterOpts{
2629
TimeNewRepos: 1 * time.Second,
2730
ResultsPerPage: 100,
28-
AuthToken: "",
31+
AuthToken: token,
2932
}),
3033
&GHProviderOpts{
3134
MaxJobBuffer: 50,
@@ -72,3 +75,58 @@ func TestGHProvider(t *testing.T) {
7275
req.True(strings.Contains(job.Endpoints[0], org))
7376
}
7477
}
78+
79+
func TestGHProviderSkipForks(t *testing.T) {
80+
var req = require.New(t)
81+
const org = "src-d"
82+
83+
token, skip := testToken()
84+
if skip != nil {
85+
t.Skip(skip.Error())
86+
}
87+
88+
queue := make(chan gitcollector.Job, 200)
89+
provider := NewGHProvider(
90+
queue,
91+
NewGHOrgReposIter(org, &GHReposIterOpts{
92+
AuthToken: token,
93+
}),
94+
&GHProviderOpts{
95+
SkipForks: true,
96+
MaxJobBuffer: 50,
97+
},
98+
)
99+
100+
done := make(chan struct{})
101+
var err error
102+
go func() {
103+
err = provider.Start()
104+
close(done)
105+
}()
106+
107+
<-done
108+
req.True(ErrNewRepositoriesNotFound.Is(err), err.Error())
109+
close(queue)
110+
forkedRepos := []string{"or-tools", "PyHive", "go-oniguruma"}
111+
for job := range queue {
112+
j, ok := job.(*library.Job)
113+
req.True(ok)
114+
req.Len(j.Endpoints, 1)
115+
116+
for _, forked := range forkedRepos {
117+
req.False(strings.Contains(j.Endpoints[0], forked))
118+
}
119+
}
120+
}
121+
122+
// testToken returns the value of the GITHUB_TOKEN environment variable.
//
// The returned error is non-nil only when the token is empty and the TRAVIS
// environment variable equals "true"; in every other case the error is nil,
// even if the token itself is empty. The token is returned alongside the
// error in both cases.
func testToken() (string, error) {
	token := os.Getenv("GITHUB_TOKEN")

	// A present token, or a run outside Travis CI, is never an error.
	if token != "" || os.Getenv("TRAVIS") != "true" {
		return token, nil
	}

	return token, fmt.Errorf("test running on travis CI but " +
		"couldn't find GITHUB_TOKEN")
}

library/job.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,6 @@ func NewJobScheduleFn(
186186
}
187187

188188
if errClosedChan.Is(err) {
189-
println("CLOSE")
190189
download = nil
191190
}
192191
}

0 commit comments

Comments
 (0)