Skip to content

Commit f31f292

Browse files
authored
Merge pull request #2 from twiny/dev
v2
2 parents 4370831 + 4230470 commit f31f292

28 files changed

Lines changed: 1640 additions & 1088 deletions

.gitignore

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
cmd/tests
2+
config/config.yaml
3+
log/
4+
result/
5+
store/
6+
bin/
7+
bbolt/

LICENSE

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
MIT License
22

3-
Copyright (c) 2020 Iss Meftah
3+
Copyright (c) 2022 Twiny
44

55
Permission is hereby granted, free of charge, to any person obtaining a copy
66
of this software and associated documentation files (the "Software"), to deal

README.md

Lines changed: 53 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -1,81 +1,54 @@
11
## Spidy
2-
Spidy is a tool that crawl web pages from a given list of websites, it match all domains on each page and find expired domains among them.
3-
4-
# Usage
5-
compile the package
6-
`
7-
go build .
8-
`
9-
then run
10-
`
11-
./Spidy -config /path/to/setting.yaml
12-
`
13-
14-
# Output/Results
15-
results will be saved in ./log folder:
16-
17-
errors.txt: errors while scraping will be stored here. helpful for debugging.
18-
19-
domains.txt: list of all unique domain checked.
20-
21-
found.txt: list of the available domains found.
22-
23-
visited.txt: list of all unique visited URLs.
24-
25-
26-
## Engine Setting:
27-
main app setting:
28-
29-
**- worker :number of threads**
30-
31-
example: worker:10 => scrap 10 urls at once.
32-
33-
**- depth: page scraping depth**
34-
35-
example: depth:5 => visit the link from
36-
the 1st page and follow link found in 2nd page
37-
till the 5th page
38-
39-
**- parallel: number of processor**
40-
41-
example: parallel:5 => on the scraped page process
42-
5 link at once.
43-
44-
**- urls: path to a .txt file.**
45-
46-
path to the input.txt which will have a URLs
47-
a new URL in each line.
48-
49-
**- proxies: an array of proxy. accepts only HTTP proxies.**
50-
51-
if no proxy is added. proxy scraping will be disabled.
52-
if one proxy is added. all scraping will be through one proxy.
53-
if more then two proxies added. scraping will be rotated.
54-
example:
55-
56-
proxies: ["http://username:password@1.1.1.1:2345","http://username:password1.1.1.1:2345","http://username:password1.1.1.1:2345"]
57-
58-
to disable able proxy, use empty array, like:
59-
proxies: []
60-
61-
62-
**- tlds: an array of tld.**
63-
64-
example: [com, net, org]
65-
66-
an empty array will match all the 122 TLD in crawler/tld.go
67-
68-
**- random_delay: time duration**
69-
70-
a random time duration between requests
71-
example: 10s
72-
73-
**- timeout: time duration**
74-
75-
set timeout for HTTP requests
76-
example: 60s
77-
78-
# Big Thanks
79-
Colly V2 => https://github.com/gocolly/colly
80-
81-
[![Donate with Ethereum](https://en.cryptobadges.io/badge/small/0x94a003520Ad7F9aFF613c1cb6798a96256217EC9)](https://en.cryptobadges.io/donate/0x94a003520Ad7F9aFF613c1cb6798a96256217EC9)
2+
A tool that crawls websites to find domain names and checks their availability.
3+
4+
### Install
5+
6+
```sh
7+
git clone https://github.com/twiny/spidy.git
8+
cd ./spidy
9+
10+
# build
11+
go build -o bin/spidy -v cmd/spidy/main.go
12+
13+
# run
14+
./bin/spidy -c config/config.yaml -u https://github.com
15+
```
16+
17+
## Configuration
18+
19+
```yaml
20+
# main crawler config
21+
crawler:
22+
max_depth: 10 # max depth of pages to visit per website.
23+
# filter: [] # regexp filter
24+
rate_limit: "1/5s" # 1 request per 5 sec
25+
max_body_size: "20MB" # max page body size
26+
user_agents: # array of user-agents
27+
- "Spidy/2.1; +https://github.com/twiny/spidy"
28+
# proxies: [] # array of proxy. http(s), SOCKS5
29+
# Logs
30+
log:
31+
rotate: 7 # log rotation
32+
path: "./log" # log directory
33+
# Store
34+
store:
35+
ttl: "24h" # keep cache for 24h
36+
path: "./store" # store directory
37+
# Results
38+
result:
39+
path: ./result # result directory
40+
parralle: 3 # number of concurrent workers
41+
timeout: "5m" # request timeout
42+
tlds: ["biz", "cc", "com", "edu", "info", "net", "org", "tv"] # array of domain extension to check.
43+
```
44+
45+
46+
## TODO
47+
48+
- [ ] Add support for more `writers`.
49+
- [ ] Add terminal logging.
50+
- [ ] Add test cases.
51+
52+
## Issues
53+
54+
NOTE: This package is provided "as is" with no guarantee. Use it at your own risk and always test it yourself before using it in a production environment. If you find any issues, please create a new issue.

cmd/spidy/api/spider.go

Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
package api
2+
3+
import (
4+
"context"
5+
_ "embed"
6+
"fmt"
7+
"log"
8+
"net/http"
9+
"os"
10+
"os/signal"
11+
"strconv"
12+
"sync"
13+
"syscall"
14+
15+
//
16+
17+
"github.com/twiny/spidy/v2/internal/pkg/spider/v1"
18+
"github.com/twiny/spidy/v2/internal/service/cache"
19+
"github.com/twiny/spidy/v2/internal/service/writer"
20+
21+
//
22+
"github.com/twiny/domaincheck"
23+
"github.com/twiny/flog"
24+
"github.com/twiny/wbot"
25+
)
26+
27+
// Version holds the contents of the "version" file that sits next to this
// source file; the go:embed directive below copies it in at build time.
//
//go:embed version
28+
var Version string
29+
30+
// Spider
31+
type Spider struct {
32+
wg *sync.WaitGroup
33+
setting *spider.Setting
34+
bot *wbot.WBot
35+
pages chan *spider.Page
36+
check *domaincheck.Checker
37+
store spider.Storage
38+
write spider.Writer
39+
log *flog.Logger
40+
}
41+
42+
// NewSpider
43+
func NewSpider(fp string) (*Spider, error) {
44+
// get settings
45+
setting := spider.ParseSetting(fp)
46+
47+
// crawler opts
48+
opts := []wbot.Option{
49+
wbot.SetParallel(setting.Parralle),
50+
wbot.SetMaxDepth(setting.Crawler.MaxDepth),
51+
wbot.SetRateLimit(setting.Crawler.Limit.Rate, setting.Crawler.Limit.Interval),
52+
wbot.SetMaxBodySize(setting.Crawler.MaxBodySize),
53+
wbot.SetUserAgents(setting.Crawler.UserAgents),
54+
wbot.SetProxies(setting.Crawler.Proxies),
55+
}
56+
57+
bot := wbot.NewWBot(opts...)
58+
59+
check, err := domaincheck.NewChecker()
60+
if err != nil {
61+
return nil, err
62+
}
63+
64+
// store
65+
store, err := cache.NewCache(setting.Store.TTL, setting.Store.Path)
66+
if err != nil {
67+
return nil, err
68+
}
69+
70+
// logger
71+
log, err := flog.NewLogger(setting.Log.Path, "spidy", setting.Log.Rotate)
72+
if err != nil {
73+
return nil, err
74+
}
75+
76+
write, err := writer.NewCSVWriter(setting.Result.Path)
77+
if err != nil {
78+
return nil, err
79+
}
80+
81+
return &Spider{
82+
wg: &sync.WaitGroup{},
83+
setting: setting,
84+
bot: bot,
85+
pages: make(chan *spider.Page, setting.Parralle),
86+
check: check,
87+
store: store,
88+
write: write,
89+
log: log,
90+
}, nil
91+
}
92+
93+
// Start
94+
func (s *Spider) Start(links []string) error {
95+
// go crawl
96+
s.wg.Add(len(links))
97+
for _, link := range links {
98+
go func(l string) {
99+
defer s.wg.Done()
100+
//
101+
if err := s.bot.Crawl(l); err != nil {
102+
s.log.Error(err.Error(), map[string]string{"url": l})
103+
}
104+
}(link)
105+
}
106+
107+
// check domains
108+
s.wg.Add(s.setting.Parralle)
109+
for i := 0; i < s.setting.Parralle; i++ {
110+
go func() {
111+
defer s.wg.Done()
112+
// results
113+
for res := range s.bot.Stream() {
114+
// if response is ok
115+
if res.Status != http.StatusOK {
116+
s.log.Info("bad HTTP status", map[string]string{
117+
"url": res.URL.String(),
118+
"status": strconv.Itoa(res.Status),
119+
})
120+
continue
121+
}
122+
123+
// extract domains
124+
domains := spider.FindDomains(res.Body)
125+
126+
// check availability
127+
for _, domain := range domains {
128+
root := fmt.Sprintf("%s.%s", domain.Name, domain.TLD)
129+
130+
// check if allowed extension
131+
if len(s.setting.TLDs) > 0 {
132+
if ok := s.setting.TLDs[domain.TLD]; !ok {
133+
s.log.Info("unsupported domain", map[string]string{
134+
"domain": root,
135+
"url": res.URL.String(),
136+
})
137+
continue
138+
}
139+
}
140+
141+
// skip if already checked
142+
if s.store.HasChecked(root) {
143+
s.log.Info("already checked", map[string]string{
144+
"domain": root,
145+
"url": res.URL.String(),
146+
})
147+
continue
148+
}
149+
150+
//
151+
ctx, cancel := context.WithTimeout(context.Background(), s.setting.Timeout)
152+
defer cancel()
153+
154+
status, err := s.check.Check(ctx, root)
155+
if err != nil {
156+
s.log.Error(err.Error(), map[string]string{
157+
"domain": root,
158+
"url": res.URL.String(),
159+
})
160+
continue
161+
}
162+
163+
// save domain
164+
if err := s.write.Write(&spider.Domain{
165+
URL: res.URL.String(),
166+
Name: domain.Name,
167+
TLD: domain.TLD,
168+
Status: status.String(),
169+
}); err != nil {
170+
s.log.Error(err.Error(), map[string]string{
171+
"domain": root,
172+
"url": res.URL.String(),
173+
})
174+
continue
175+
}
176+
177+
// terminal print
178+
fmt.Printf("[Spidy] == domain: %s - status %s\n", root, status.String())
179+
}
180+
}
181+
}()
182+
}
183+
184+
s.wg.Wait()
185+
return nil
186+
}
187+
188+
// Shutdown
189+
func (s *Spider) Shutdown() error {
190+
// attempt graceful shutdown
191+
sigs := make(chan os.Signal, 1)
192+
signal.Notify(sigs, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
193+
194+
<-sigs
195+
log.Println("shutting down ...")
196+
197+
// 2nd ctrl+c kills program
198+
go func() {
199+
sigs := make(chan os.Signal, 1)
200+
signal.Notify(sigs, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
201+
<-sigs
202+
log.Println("killing program ...")
203+
os.Exit(0)
204+
}()
205+
206+
s.bot.Close()
207+
s.log.Close()
208+
if err := s.store.Close(); err != nil {
209+
return err
210+
}
211+
os.Exit(0)
212+
return nil
213+
}

cmd/spidy/api/version

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
2.0.0

0 commit comments

Comments
 (0)