@@ -12,37 +12,52 @@ import (
1212// to an instance of the crawler.
1313type octopus struct {
1414 * CrawlOptions
15- visited * sync.Map
16- isReady bool
17- adapterChSet * NodeChSet
18- isValidProtocol map [string ]bool
19- timeToQuit time.Duration
20- inputUrlStrChan chan string
21- masterQuitCh chan int
15+ visited * sync.Map
16+ isReady bool
17+ adapterChSet * NodeChSet
18+ isValidProtocol map [string ]bool
19+ timeToQuit time.Duration
20+ inputUrlStrChan chan string
21+ masterQuitCh chan int
22+ crawledUrlCounter int64
2223}
2324
2425// CrawlOptions is used to house options for crawling.
26+ //
2527// You can specify depth of exploration for each link,
26- // if crawler should ignore other hostnames (except from base host).
27- // MaxCrawlDepth - Indicates the maximum depth that will be crawled,
28- // for each new link.
29- // MaxCrawlLinks - Specifies the Maximum Number of Unique Links that will be crawled.
30- // Note : When combined with DepthPerLink, it will combine both.
31- // Use -1 to indicate infinite links to be crawled (only bounded by depth of traversal).
32- // IncludeBody - Include the response Body in the crawled NodeInfo (for further processing).
33- // OpAdapter is a user specified concrete implementation of an Output Adapter. The crawler
34- // will pump output onto the implementation's channel returned by its Consume method.
35- // CrawlRate is the rate at which requests will be made.
36- // RespectRobots (unimplemented) choose whether to respect robots.txt or not.
37- // ValidProtocols - This is an array containing the list of url protocols that
38- // should be crawled.
39- // TimeToQuit - represents the total time to wait between two new nodes to be
40- // generated before the crawler quits. This is in seconds.
28+ // and whether the crawler should ignore other host names (except the base host).
29+ //
30+ // MaxCrawlDepth - Indicates the maximum depth that will be crawled,
31+ // for each new link.
32+ //
33+ // MaxCrawledUrls - Specifies the Maximum Number of Unique Links that will be crawled.
34+ // Note: When combined with MaxCrawlDepth, both limits apply.
35+ // Use -1 to indicate infinite links to be crawled (only bounded by depth of traversal).
36+ //
37+ // StayWithinBaseHost - (unimplemented) Ensures crawler stays within the
38+ // level 1 link's hostname.
39+ //
40+ // CrawlRate (unimplemented) is the rate at which requests will be made.
41+ // In seconds
42+ //
43+ // RespectRobots (unimplemented) choose whether to respect robots.txt or not.
44+ //
45+ // IncludeBody - (unimplemented) Include the response Body in the crawled
46+ // NodeInfo (for further processing).
47+ //
48+ // OpAdapter is a user specified concrete implementation of an Output Adapter. The crawler
49+ // will pump output onto the implementation's channel returned by its Consume method.
50+ //
51+ // ValidProtocols - This is an array containing the list of url protocols that
52+ // should be crawled.
53+ //
54+ // TimeToQuit - represents the total time to wait between two new nodes to be
55+ // generated before the crawler quits. This is in seconds.
4156type CrawlOptions struct {
4257 MaxCrawlDepth int64
43- MaxCrawlLinks int64
58+ MaxCrawledUrls int64
4459 StayWithinBaseHost bool
45- CrawlRatePerSec int64
60+ CrawlRate int64
4661 RespectRobots bool
4762 IncludeBody bool
4863 OpAdapter OutputAdapter
@@ -63,12 +78,17 @@ type Node struct {
6378 Body io.ReadCloser
6479}
6580
81+ // StdChannels are used to hold the standard set of channels that are used
82+ // for special operations. Will include channels for Logging, Statistics,
83+ // etc. in the future.
6684type StdChannels struct {
6785 QuitCh chan <- int
6886 // logCh chan<- string
6987 // errorCh chan<- string
7088}
7189
90+ // NodeChSet is the standard set of channels used to build the concurrency
91+ // pipelines in the crawler.
7292type NodeChSet struct {
7393 NodeCh chan <- * Node
7494 * StdChannels
@@ -80,13 +100,19 @@ type ingestPipeChSet struct {
80100 QuitCh chan int
81101}
82102
83- // OutputAdapter is the interface for the Adapter that is used to handle
84- // output from the Octopus Crawler.
85- // The contract stipulates that the crawler provides the channel
86- // to listen for a quit command.
87- // The crawler pumps its output onto the returned channel of the Consume method.
88- // Implementers of the interface should listen on this channel for output from
89- // the crawler.
103+ // OutputAdapter is the interface that has to be implemented in order to
104+ // handle outputs from the octopus crawler.
105+ //
106+ // The octopus will call the OutputAdapter.Consume() method and deliver
107+ // all relevant output and quit signals on the channels included in the
108+ // received NodeChSet.
109+ //
110+ // This implies that it is the responsibility of the user who implements
111+ // OutputAdapter to handle processing the output of the crawler that is
112+ // delivered on the NodeChSet.NodeCh.
113+ //
114+ // Implementers of the interface should listen on the channels included in
115+ // the NodeChSet returned by Consume() for output from the crawler.
90116type OutputAdapter interface {
91117 Consume () * NodeChSet
92118}
0 commit comments