@@ -2,51 +2,84 @@ package octopus
22
33import (
44 "io"
5+ "sync"
56 "time"
67)
78
8- // Node is used to represent each crawled link and its associated depth of crawl.
9- type Node struct {
10- URLString string
11- Depth int
12- }
13-
14- // webOctopus is a concurrent version of webSpider.
15- // It has an inbuilt parser based of htmlparser.Parser to collect all links in a web-page.
9+ // octopus is a concurrent web crawler.
10+ // It has an inbuilt parser based of html.NewTokenizer to collect all links in a web-page.
1611// It also has a CrawlOptions structure to initialize setting specific
1712// to an instance of the crawler.
18- type webOctopus struct {
19- CrawlOptions
20- visited map [Node ]bool
13+ type octopus struct {
14+ * CrawlOptions
15+ visited * sync.Map
16+ isReady bool
17+ adapterChSet * NodeChSet
18+ isValidProtocol map [string ]bool
19+ timeToQuit time.Duration
20+ inputUrlStrChan chan string
21+ masterQuitCh chan int
2122}
2223
2324// CrawlOptions is used to house options for crawling.
2425// You can specify depth of exploration for each link,
2526// if crawler should ignore other hostnames (except from base host).
26- // MaxLinksCrawled - Specifies the Maximum Number of Unique Links that will be crawled.
27+ // MaxCrawlDepth - Indicates the maximum depth that will be crawled,
28+ // for each new link.
29+ // MaxCrawlLinks - Specifies the Maximum Number of Unique Links that will be crawled.
2730// Note : When combined with DepthPerLink, it will combine both.
2831// Use -1 to indicate infinite links to be crawled (only bounded by depth of traversal).
29- // IncludeBody - Include the response Body in the crawled Node (for further processing).
32+ // IncludeBody - Include the response Body in the crawled NodeInfo (for further processing).
3033// OpAdapter is a user specified concrete implementation of an Output Adapter. The crawler
3134// will pump output onto the implementation's channel returned by its Consume method.
3235// CrawlRate is the rate at which requests will be made.
3336// RespectRobots (unimplemented) choose whether to respect robots.txt or not.
37+ // ValidProtocols - This is an array containing the list of url protocols that
38+ // should be crawled.
39+ // TimeToQuit - represents the total time to wait between two new nodes to be
40+ // generated before the crawler quits. This is in seconds.
3441type CrawlOptions struct {
35- DepthPerLink int16
36- MaxLinksCrawled int64
42+ MaxCrawlDepth int64
43+ MaxCrawlLinks int64
3744 StayWithinBaseHost bool
38- BaseURLString string
39- CrawlRate time.Duration
45+ CrawlRatePerSec int64
4046 RespectRobots bool
4147 IncludeBody bool
4248 OpAdapter OutputAdapter
49+ ValidProtocols []string
50+ TimeToQuit int64
4351}
4452
// NodeInfo is used to represent each crawled link and its associated crawl depth.
type NodeInfo struct {
	ParentUrlString string // URL of the page on which this link was found
	UrlString       string // the crawled URL itself
	Depth           int64  // crawl depth at which this URL was reached
}

// Node encloses a NodeInfo and its Body (HTML) Content.
type Node struct {
	*NodeInfo
	Body io.ReadCloser // response body; populated when CrawlOptions.IncludeBody is set
}
4965
// StdChannels groups the standard control channels shared by crawler components.
type StdChannels struct {
	QuitCh chan<- int // send-only channel used to signal shutdown
	// logCh chan<- string
	// errorCh chan<- string
}
71+
72+ type NodeChSet struct {
73+ NodeCh chan <- * Node
74+ * StdChannels
75+ }
76+
77+ type ingestPipeChSet struct {
78+ NodeCh chan * Node
79+ StrCh chan string
80+ QuitCh chan int
81+ }
82+
5083// OutputAdapter is the interface for the Adapter that is used to handle
5184// output from the Octopus Crawler.
5285// The contract stipulates that the crawler provides the channel
@@ -55,5 +88,5 @@ type CrawlOutput struct {
5588// Implementers of the interface should listen on this channel for output from
5689// the crawler.
5790type OutputAdapter interface {
58- Consume (quitCh <- chan bool ) chan <- CrawlOutput
91+ Consume () * NodeChSet
5992}
0 commit comments