
Commit da5d755

Merge pull request #4 from rapidclock/#a98h3-feature-crawled-links-limit
#a98h3 feature crawled links limit and #avmxf improved documentation
2 parents ced1e8c + 744f355 commit da5d755

File tree: 9 files changed (+196, −39 lines)

.travis.yml

Lines changed: 10 additions & 0 deletions
```diff
@@ -1,3 +1,13 @@
+sudo: false
+
 language: go
 go:
   - "1.11"
+
+notifications:
+  email: false
+
+script:
+  - go doc octopus
+  - go doc adapter
+  - go test -v ./...
```

README.md

Lines changed: 27 additions & 1 deletion
```diff
@@ -3,10 +3,36 @@
 <br>
 A concurrent web crawler to crawl the web.
 
-### Current Features:
+## Current Features:
 - Depth Limited Crawling
 - User specified valid protocols
 - User buildable adapters that the crawler feeds output to.
 - Filter Duplicates.
 - Filter URLs that fail a HEAD request.
 - User specifiable max timeout between two successive url requests.
+- Max Number of Links to be crawled.
+
+
+### Sample Implementation Snippet
+
+```go
+package main
+
+import (
+    "github.com/rapidclock/web-octopus/adapter"
+    "github.com/rapidclock/web-octopus/octopus"
+)
+
+func main() {
+    opAdapter := &adapter.StdOpAdapter{}
+
+    options := octopus.GetDefaultCrawlOptions()
+    options.MaxCrawlDepth = 3
+    options.TimeToQuit = 10
+    options.OpAdapter = opAdapter
+
+    crawler := octopus.New(options)
+    crawler.SetupSystem()
+    crawler.BeginCrawling("https://www.example.com")
+}
+```
```
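The snippet above does not exercise the crawl limit this PR adds. A minimal variation using the new MaxCrawledUrls field (the field name comes from the diffs below; the cap of 100 is illustrative) might look like:

```go
package main

import (
    "github.com/rapidclock/web-octopus/adapter"
    "github.com/rapidclock/web-octopus/octopus"
)

func main() {
    opAdapter := &adapter.StdOpAdapter{}

    options := octopus.GetDefaultCrawlOptions()
    options.MaxCrawlDepth = 3
    options.MaxCrawledUrls = 100 // illustrative cap; negative means unlimited, zero panics during setup
    options.OpAdapter = opAdapter

    crawler := octopus.New(options)
    crawler.SetupSystem()
    crawler.BeginCrawling("https://www.example.com")
}
```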

adapter/basicadapters.go

Lines changed: 12 additions & 3 deletions
```diff
@@ -9,7 +9,11 @@ import (
 	oct "github.com/rapidclock/web-octopus/octopus"
 )
 
-// StdOpAdapter is an output adapter that just prints the output onto the screen.
+// StdOpAdapter is an output adapter that just prints the output onto the
+// screen.
+//
+// Sample Output Format is:
+// LinkNum - Depth - Url
 type StdOpAdapter struct{}
 
 func (s *StdOpAdapter) Consume() *oct.NodeChSet {
@@ -22,10 +26,12 @@ func (s *StdOpAdapter) Consume() *oct.NodeChSet {
 		},
 	}
 	go func() {
+		i := 1
 		for {
 			select {
 			case output := <-listenCh:
-				fmt.Printf("%d - %s\n", output.Depth, output.UrlString)
+				fmt.Printf("%d - %d - %s\n", i, output.Depth, output.UrlString)
+				i++
 			case <-quitCh:
 				return
 			}
@@ -34,7 +40,10 @@ func (s *StdOpAdapter) Consume() *oct.NodeChSet {
 	return listenChSet
 }
 
-// FileWriterAdapter is an output adapter that writes the output to a specified file.
+// FileWriterAdapter is an output adapter that writes the output to a
+// specified file.
+// Sample Output Format is:
+// Depth - Url
 type FileWriterAdapter struct {
 	FilePath string
 }
```
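With the running counter introduced above, StdOpAdapter output follows the LinkNum - Depth - Url format; roughly like this (URLs and depths are illustrative):

```
1 - 0 - https://www.example.com
2 - 1 - https://www.example.com/about
3 - 1 - https://www.example.com/blog
```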

adapter/doc.go

Lines changed: 12 additions & 0 deletions
```diff
@@ -0,0 +1,12 @@
+/*
+Package adapter contains implementations of the OutputAdapter interface
+of the octopus crawler.
+
+This package contains two types of adapters: StdOpAdapter and FileWriterAdapter.
+The StdOpAdapter prints the depth and url to standard output (usually the
+screen). The FileWriterAdapter prints the output to a specified file.
+
+Both can be used as an OutputAdapter as part of the octopus crawler's
+CrawlOptions.
+*/
+package adapter
```
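As a usage sketch for the file-backed adapter described here (wiring mirrors the README snippet; the output file path is illustrative):

```go
package main

import (
    "github.com/rapidclock/web-octopus/adapter"
    "github.com/rapidclock/web-octopus/octopus"
)

func main() {
    // Write crawl output to a file instead of standard output.
    opAdapter := &adapter.FileWriterAdapter{FilePath: "crawl-output.txt"}

    options := octopus.GetDefaultCrawlOptions()
    options.OpAdapter = opAdapter

    crawler := octopus.New(options)
    crawler.SetupSystem()
    crawler.BeginCrawling("https://www.example.com")
}
```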

octopus/core.go

Lines changed: 17 additions & 2 deletions
```diff
@@ -9,6 +9,7 @@ import (
 func (o *octopus) setupOctopus() {
 	o.setupValidProtocolMap()
 	o.setupTimeToQuit()
+	o.setupMaxLinksCrawled()
 }
 
 func (o *octopus) setupValidProtocolMap() {
@@ -26,6 +27,12 @@ func (o *octopus) setupTimeToQuit() {
 	}
 }
 
+func (o *octopus) setupMaxLinksCrawled() {
+	if o.MaxCrawledUrls == 0 {
+		panic("MaxCrawledUrls should either be negative or greater than 0.")
+	}
+}
+
 func (o *octopus) SetupSystem() {
 	o.isReady = false
 	o.setupOctopus()
@@ -49,7 +56,15 @@ func (o *octopus) SetupSystem() {
 	pageParseChSet := o.makeParseNodeFromHtmlPipe(ingestChSet)
 	depthLimitChSet := o.makeCrawlDepthFilterPipe(pageParseChSet)
 	maxDelayChSet := o.makeMaxDelayPipe(depthLimitChSet)
-	distributorChSet := o.makeDistributorPipe(maxDelayChSet, outAdapterChSet)
+
+	var distributorChSet *NodeChSet
+	if o.MaxCrawledUrls < 0 {
+		distributorChSet = o.makeDistributorPipe(maxDelayChSet, outAdapterChSet)
+	} else {
+		maxLinksCrawledChSet := o.makeLimitCrawlPipe(outAdapterChSet)
+		distributorChSet = o.makeDistributorPipe(maxDelayChSet, maxLinksCrawledChSet)
+	}
+
 	pageReqChSet := o.makePageRequisitionPipe(distributorChSet)
 	invUrlFilterChSet := o.makeInvalidUrlFilterPipe(pageReqChSet)
 	dupFilterChSet := o.makeDuplicateUrlFilterPipe(invUrlFilterChSet)
@@ -64,7 +79,7 @@ func (o *octopus) SetupSystem() {
 
 func (o *octopus) BeginCrawling(baseUrlStr string) {
 	if !o.isReady {
-		log.Fatal("Call BuildSystem first to setup Octopus")
+		panic("Call BuildSystem first to setup Octopus")
 	}
 	go func() {
 		o.inputUrlStrChan <- baseUrlStr
```
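The new setupMaxLinksCrawled check rejects a zero limit before any of the pipeline is built. A small sketch of that behaviour (assuming the octopus.New constructor shown in the README snippet):

```go
package main

import (
    "fmt"

    "github.com/rapidclock/web-octopus/adapter"
    "github.com/rapidclock/web-octopus/octopus"
)

func main() {
    defer func() {
        if r := recover(); r != nil {
            fmt.Println("recovered:", r) // "MaxCrawledUrls should either be negative or greater than 0."
        }
    }()

    options := octopus.GetDefaultCrawlOptions()
    options.MaxCrawledUrls = 0 // invalid: use a negative value for unlimited, or a positive cap
    options.OpAdapter = &adapter.StdOpAdapter{}

    crawler := octopus.New(options)
    crawler.SetupSystem() // panics in setupMaxLinksCrawled, before the pipeline is wired
}
```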

octopus/doc.go

Lines changed: 37 additions & 0 deletions
```diff
@@ -0,0 +1,37 @@
+/*
+Package octopus implements a concurrent web crawler.
+The octopus uses a pipeline of channels to implement a non-blocking web crawler.
+The octopus also provides user configurable options that can be used to
+customize the behaviour of the crawler.
+
+Features
+
+Current Features of the crawler include:
+1. User specifiable Depth Limited Crawling
+2. User specified valid protocols
+3. User buildable adapters that the crawler feeds output to.
+4. Filter Duplicates.
+5. Filter URLs that fail a HEAD request.
+6. User specifiable max timeout between two successive url requests.
+7. User specifiable Max Number of Links to be crawled.
+
+
+Pipeline Overview
+
+The overview of the Pipeline is given below:
+1. Ingest
+2. Link Absolution
+3. Protocol Filter
+4. Duplicate Filter
+5. Invalid Url Filter (Urls whose HEAD request fails)
+6. Make GET Request
+7a. Send to Output Adapter
+7b. Check for Timeout (gap between two outputs on this channel).
+8. Max Links Crawled Limit Filter
+9. Depth Limit Filter
+10. Parse Page for more URLs.
+
+Note: The output from 7b. is fed to 8.
+1 -> 2 -> 3 -> 4 -> 5 -> 6 -> 7b -> 8 -> 9 -> 10 -> 1
+*/
+package octopus
```
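The doc comment describes the crawler as a chain of channel-connected stages. A hypothetical stage in that style, written against the exported Node, NodeChSet, and MakeNodeChSet names from this package (makeFilterStage itself is not part of the package):

```go
package example

import "github.com/rapidclock/web-octopus/octopus"

// makeFilterStage sketches one linear pipeline stage: it reads Nodes from
// its own input channel, applies a predicate, and forwards survivors to
// the next stage's channel set. Illustrative only.
func makeFilterStage(next *octopus.NodeChSet, keep func(*octopus.Node) bool) *octopus.NodeChSet {
    in := make(chan *octopus.Node)
    quit := make(chan int)
    go func() {
        for {
            select {
            case n := <-in:
                if keep(n) {
                    next.NodeCh <- n // pass the node downstream
                }
            case <-quit:
                return // a quit signal shuts the stage down
            }
        }
    }()
    return octopus.MakeNodeChSet(in, quit)
}
```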

octopus/modelfactory.go

Lines changed: 5 additions & 2 deletions
```diff
@@ -7,6 +7,7 @@ const (
 	anchorTag         = "a"
 	anchorAttrb       = "href"
 	defaultTimeToQuit = 5
+	defaultCrawlLimit int64 = -1
 )
 
 // NewWithDefaultOptions - Create an Instance of the Octopus with the default CrawlOptions.
@@ -40,12 +41,13 @@ func createNode(parentUrlStr, urlStr string, depth int64) *Node {
 	}
 }
 
+// Returns an instance of CrawlOptions with the values set to sensible defaults.
 func GetDefaultCrawlOptions() *CrawlOptions {
 	return &CrawlOptions{
 		MaxCrawlDepth:      defaultMaxDepth,
-		MaxCrawlLinks:      -1,
+		MaxCrawledUrls:     defaultCrawlLimit,
 		StayWithinBaseHost: false,
-		CrawlRatePerSec:    -1,
+		CrawlRate:          -1,
 		RespectRobots:      false,
 		IncludeBody:        true,
 		OpAdapter:          nil,
@@ -54,6 +56,7 @@ func GetDefaultCrawlOptions() *CrawlOptions {
 	}
 }
 
+// Utility function to create a NodeChSet given a created Node and Quit Channel.
 func MakeNodeChSet(nodeCh chan<- *Node, quitCh chan<- int) *NodeChSet {
 	return &NodeChSet{
 		NodeCh: nodeCh,
```

octopus/models.go

Lines changed: 57 additions & 31 deletions
```diff
@@ -12,37 +12,52 @@ import (
 // to an instance of the crawler.
 type octopus struct {
 	*CrawlOptions
-	visited         *sync.Map
-	isReady         bool
-	adapterChSet    *NodeChSet
-	isValidProtocol map[string]bool
-	timeToQuit      time.Duration
-	inputUrlStrChan chan string
-	masterQuitCh    chan int
+	visited           *sync.Map
+	isReady           bool
+	adapterChSet      *NodeChSet
+	isValidProtocol   map[string]bool
+	timeToQuit        time.Duration
+	inputUrlStrChan   chan string
+	masterQuitCh      chan int
+	crawledUrlCounter int64
 }
 
 // CrawlOptions is used to house options for crawling.
+//
 // You can specify depth of exploration for each link,
-// if crawler should ignore other hostnames (except from base host).
-// MaxCrawlDepth - Indicates the maximum depth that will be crawled,
-// for each new link.
-// MaxCrawlLinks - Specifies the Maximum Number of Unique Links that will be crawled.
-// Note : When combined with DepthPerLink, it will combine both.
-// Use -1 to indicate infinite links to be crawled (only bounded by depth of traversal).
-// IncludeBody - Include the response Body in the crawled NodeInfo (for further processing).
-// OpAdapter is a user specified concrete implementation of an Output Adapter. The crawler
-// will pump output onto the implementation's channel returned by its Consume method.
-// CrawlRate is the rate at which requests will be made.
-// RespectRobots (unimplemented) choose whether to respect robots.txt or not.
-// ValidProtocols - This is an array containing the list of url protocols that
-// should be crawled.
-// TimeToQuit - represents the total time to wait between two new nodes to be
-// generated before the crawler quits. This is in seconds.
+// if crawler should ignore other host names (except from base host).
+//
+// MaxCrawlDepth - Indicates the maximum depth that will be crawled,
+// for each new link.
+//
+// MaxCrawledUrls - Specifies the Maximum Number of Unique Links that will be crawled.
+// Note: When combined with DepthPerLink, it will combine both.
+// Use -1 to indicate infinite links to be crawled (only bounded by depth of traversal).
+//
+// StayWithinBaseHost - (unimplemented) Ensures crawler stays within the
+// level 1 link's hostname.
+//
+// CrawlRate (unimplemented) is the rate at which requests will be made.
+// In seconds.
+//
+// RespectRobots (unimplemented) choose whether to respect robots.txt or not.
+//
+// IncludeBody - (unimplemented) Include the response Body in the crawled
+// NodeInfo (for further processing).
+//
+// OpAdapter is a user specified concrete implementation of an Output Adapter. The crawler
+// will pump output onto the implementation's channel returned by its Consume method.
+//
+// ValidProtocols - This is an array containing the list of url protocols that
+// should be crawled.
+//
+// TimeToQuit - represents the total time to wait between two new nodes to be
+// generated before the crawler quits. This is in seconds.
 type CrawlOptions struct {
 	MaxCrawlDepth      int64
-	MaxCrawlLinks      int64
+	MaxCrawledUrls     int64
 	StayWithinBaseHost bool
-	CrawlRatePerSec    int64
+	CrawlRate          int64
 	RespectRobots      bool
 	IncludeBody        bool
 	OpAdapter          OutputAdapter
@@ -63,12 +78,17 @@ type Node struct {
 	Body io.ReadCloser
 }
 
+// StdChannels holds the standard set of channels that are used
+// for special operations. Will include channels for Logging, Statistics,
+// etc. in the future.
 type StdChannels struct {
 	QuitCh chan<- int
 	// logCh chan<- string
 	// errorCh chan<- string
 }
 
+// NodeChSet is the standard set of channels used to build the concurrency
+// pipelines in the crawler.
 type NodeChSet struct {
 	NodeCh chan<- *Node
 	*StdChannels
@@ -80,13 +100,19 @@ type ingestPipeChSet struct {
 	QuitCh chan int
 }
 
-// OutputAdapter is the interface for the Adapter that is used to handle
-// output from the Octopus Crawler.
-// The contract stipulates that the crawler provides the channel
-// to listen for a quit command.
-// The crawler pumps its output onto the returned channel of the Consume method.
-// Implementers of the interface should listen on this channel for output from
-// the crawler.
+// OutputAdapter is the interface that has to be implemented in order to
+// handle outputs from the octopus crawler.
+//
+// The octopus will call the OutputAdapter.Consume() method and deliver
+// all relevant output and quit signals on the channels included in the
+// received NodeChSet.
+//
+// This implies that it is the responsibility of the user who implements
+// OutputAdapter to handle processing the output of the crawler that is
+// delivered on the NodeChSet.NodeCh.
+//
+// Implementers of the interface should listen to the included channels in
+// the output of Consume() for output from the crawler.
 type OutputAdapter interface {
 	Consume() *NodeChSet
 }
```
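As a sketch of the contract this comment spells out, a minimal custom adapter (the CountingAdapter type is hypothetical, modeled on StdOpAdapter above) could look like:

```go
package example

import (
    "sync/atomic"

    "github.com/rapidclock/web-octopus/octopus"
)

// CountingAdapter is a hypothetical OutputAdapter that just counts the
// nodes the crawler delivers on the channel set returned by Consume.
type CountingAdapter struct {
    count int64
}

func (c *CountingAdapter) Consume() *octopus.NodeChSet {
    listenCh := make(chan *octopus.Node)
    quitCh := make(chan int)
    go func() {
        for {
            select {
            case <-listenCh: // one crawled node delivered
                atomic.AddInt64(&c.count, 1)
            case <-quitCh: // crawler signalled shutdown
                return
            }
        }
    }()
    return octopus.MakeNodeChSet(listenCh, quitCh)
}
```

Like the built-in adapters, an instance would be assigned to CrawlOptions.OpAdapter before SetupSystem is called.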

octopus/pipe_ctrl_limitcrawl.go

Lines changed: 19 additions & 0 deletions
```diff
@@ -0,0 +1,19 @@
+package octopus
+
+import (
+	"sync/atomic"
+)
+
+func (o *octopus) makeLimitCrawlPipe(inChSet *NodeChSet) *NodeChSet {
+	return stdLinearNodeFunc(o.checkWithinLimit, inChSet)
+}
+
+func (o *octopus) checkWithinLimit(node *Node, outChSet *NodeChSet) {
+	if v := atomic.AddInt64(&o.crawledUrlCounter,
+		1); v <= o.MaxCrawledUrls {
+		outChSet.NodeCh <- node
+	} else {
+		outChSet.QuitCh <- 1
+		o.masterQuitCh <- 1
+	}
+}
```
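checkWithinLimit relies on atomic.AddInt64 so that concurrent pipeline goroutines can never pass more than MaxCrawledUrls nodes in total. A standalone, runnable sketch of that pattern (the limit and goroutine count are illustrative):

```go
package main

import (
    "fmt"
    "sync"
    "sync/atomic"
)

func main() {
    const limit = 5 // stand-in for MaxCrawledUrls
    var counter, passed int64
    var wg sync.WaitGroup
    for i := 0; i < 20; i++ {
        wg.Add(1)
        go func() {
            defer wg.Done()
            // The increment and the comparison use one atomic result, so
            // exactly `limit` goroutines ever see a value within the limit.
            if atomic.AddInt64(&counter, 1) <= limit {
                atomic.AddInt64(&passed, 1)
            }
        }()
    }
    wg.Wait()
    fmt.Println(passed) // always 5, regardless of scheduling
}
```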
