
Commit da5d755

Merge pull request #4 from rapidclock/#a98h3-feature-crawled-links-limit
#a98h3 feature crawled links limit and #avmxf improved documentation
2 parents ced1e8c + 744f355 commit da5d755

File tree: 9 files changed (+196, −39 lines)

.travis.yml

Lines changed: 10 additions & 0 deletions
```diff
@@ -1,3 +1,13 @@
+sudo: false
+
 language: go
 go:
   - "1.11"
+
+notifications:
+  email: false
+
+script:
+  - go doc octopus
+  - go doc adapter
+  - go test -v ./...
```

README.md

Lines changed: 27 additions & 1 deletion
```diff
@@ -3,10 +3,36 @@
 <br>
 A concurrent web crawler to crawl the web.
 
-### Current Features:
+## Current Features:
 - Depth Limited Crawling
 - User specified valid protocols
 - User buildable adapters that the crawler feeds output to.
 - Filter Duplicates.
 - Filter URLs that fail a HEAD request.
 - User specifiable max timeout between two successive url requests.
+- Max Number of Links to be crawled.
+
+
+### Sample Implementation Snippet
+
+```go
+package main
+
+import (
+    "github.com/rapidclock/web-octopus/adapter"
+    "github.com/rapidclock/web-octopus/octopus"
+)
+
+func main() {
+    opAdapter := &adapter.StdOpAdapter{}
+
+    options := octopus.GetDefaultCrawlOptions()
+    options.MaxCrawlDepth = 3
+    options.TimeToQuit = 10
+    options.OpAdapter = opAdapter
+
+    crawler := octopus.New(options)
+    crawler.SetupSystem()
+    crawler.BeginCrawling("https://www.example.com")
+}
+```
```
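The snippet above does not exercise the crawl limit this PR adds. A minimal variation using the new MaxCrawledUrls field (the field name comes from the diffs below; the cap of 100 is illustrative) might look like:

```go
package main

import (
    "github.com/rapidclock/web-octopus/adapter"
    "github.com/rapidclock/web-octopus/octopus"
)

func main() {
    opAdapter := &adapter.StdOpAdapter{}

    options := octopus.GetDefaultCrawlOptions()
    options.MaxCrawlDepth = 3
    options.MaxCrawledUrls = 100 // illustrative cap; negative means unlimited, zero panics during setup
    options.OpAdapter = opAdapter

    crawler := octopus.New(options)
    crawler.SetupSystem()
    crawler.BeginCrawling("https://www.example.com")
}
```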

adapter/basicadapters.go

Lines changed: 12 additions & 3 deletions
```diff
@@ -9,7 +9,11 @@ import (
 	oct "github.com/rapidclock/web-octopus/octopus"
 )
 
-// StdOpAdapter is an output adapter that just prints the output onto the screen.
+// StdOpAdapter is an output adapter that just prints the output onto the
+// screen.
+//
+// Sample Output Format is:
+// LinkNum - Depth - Url
 type StdOpAdapter struct{}
 
 func (s *StdOpAdapter) Consume() *oct.NodeChSet {
@@ -22,10 +26,12 @@ func (s *StdOpAdapter) Consume() *oct.NodeChSet {
 		},
 	}
 	go func() {
+		i := 1
 		for {
 			select {
 			case output := <-listenCh:
-				fmt.Printf("%d - %s\n", output.Depth, output.UrlString)
+				fmt.Printf("%d - %d - %s\n", i, output.Depth, output.UrlString)
+				i++
 			case <-quitCh:
 				return
 			}
@@ -34,7 +40,10 @@ func (s *StdOpAdapter) Consume() *oct.NodeChSet {
 	return listenChSet
 }
 
-// FileWriterAdapter is an output adapter that writes the output to a specified file.
+// FileWriterAdapter is an output adapter that writes the output to a
+// specified file.
+// Sample Output Format is:
+// Depth - Url
 type FileWriterAdapter struct {
 	FilePath string
 }
```
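With the running counter introduced above, StdOpAdapter output follows the LinkNum - Depth - Url format; roughly like this (URLs and depths are illustrative):

```
1 - 0 - https://www.example.com
2 - 1 - https://www.example.com/about
3 - 1 - https://www.example.com/blog
```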

adapter/doc.go

Lines changed: 12 additions & 0 deletions
```diff
@@ -0,0 +1,12 @@
+/*
+Package adapter contains implementations of the OutputAdapter interface
+of the octopus crawler.
+
+This package contains two types of adapters: StdOpAdapter and FileWriterAdapter.
+The StdOpAdapter prints the depth and url to standard output (usually the
+screen). The FileWriterAdapter prints the output to a specified file.
+
+Both can be used as an OutputAdapter as part of the octopus crawler's
+CrawlOptions.
+*/
+package adapter
```
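As a usage sketch for the file-backed adapter described here (wiring mirrors the README snippet; the output file path is illustrative):

```go
package main

import (
    "github.com/rapidclock/web-octopus/adapter"
    "github.com/rapidclock/web-octopus/octopus"
)

func main() {
    // Write crawl output to a file instead of standard output.
    opAdapter := &adapter.FileWriterAdapter{FilePath: "crawl-output.txt"}

    options := octopus.GetDefaultCrawlOptions()
    options.OpAdapter = opAdapter

    crawler := octopus.New(options)
    crawler.SetupSystem()
    crawler.BeginCrawling("https://www.example.com")
}
```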

octopus/core.go

Lines changed: 17 additions & 2 deletions
```diff
@@ -9,6 +9,7 @@ import (
 func (o *octopus) setupOctopus() {
 	o.setupValidProtocolMap()
 	o.setupTimeToQuit()
+	o.setupMaxLinksCrawled()
 }
 
 func (o *octopus) setupValidProtocolMap() {
@@ -26,6 +27,12 @@ func (o *octopus) setupTimeToQuit() {
 	}
 }
 
+func (o *octopus) setupMaxLinksCrawled() {
+	if o.MaxCrawledUrls == 0 {
+		panic("MaxCrawledUrls should either be negative or greater than 0.")
+	}
+}
+
 func (o *octopus) SetupSystem() {
 	o.isReady = false
 	o.setupOctopus()
@@ -49,7 +56,15 @@ func (o *octopus) SetupSystem() {
 	pageParseChSet := o.makeParseNodeFromHtmlPipe(ingestChSet)
 	depthLimitChSet := o.makeCrawlDepthFilterPipe(pageParseChSet)
 	maxDelayChSet := o.makeMaxDelayPipe(depthLimitChSet)
-	distributorChSet := o.makeDistributorPipe(maxDelayChSet, outAdapterChSet)
+
+	var distributorChSet *NodeChSet
+	if o.MaxCrawledUrls < 0 {
+		distributorChSet = o.makeDistributorPipe(maxDelayChSet, outAdapterChSet)
+	} else {
+		maxLinksCrawledChSet := o.makeLimitCrawlPipe(outAdapterChSet)
+		distributorChSet = o.makeDistributorPipe(maxDelayChSet, maxLinksCrawledChSet)
+	}
+
 	pageReqChSet := o.makePageRequisitionPipe(distributorChSet)
 	invUrlFilterChSet := o.makeInvalidUrlFilterPipe(pageReqChSet)
 	dupFilterChSet := o.makeDuplicateUrlFilterPipe(invUrlFilterChSet)
@@ -64,7 +79,7 @@ func (o *octopus) SetupSystem() {
 
 func (o *octopus) BeginCrawling(baseUrlStr string) {
 	if !o.isReady {
-		log.Fatal("Call BuildSystem first to setup Octopus")
+		panic("Call BuildSystem first to setup Octopus")
 	}
 	go func() {
 		o.inputUrlStrChan <- baseUrlStr
```
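The new setupMaxLinksCrawled check rejects a zero limit before any of the pipeline is built. A small sketch of that behaviour (assuming the octopus.New constructor shown in the README snippet):

```go
package main

import (
    "fmt"

    "github.com/rapidclock/web-octopus/adapter"
    "github.com/rapidclock/web-octopus/octopus"
)

func main() {
    defer func() {
        if r := recover(); r != nil {
            fmt.Println("recovered:", r) // "MaxCrawledUrls should either be negative or greater than 0."
        }
    }()

    options := octopus.GetDefaultCrawlOptions()
    options.MaxCrawledUrls = 0 // invalid: use a negative value for unlimited, or a positive cap
    options.OpAdapter = &adapter.StdOpAdapter{}

    crawler := octopus.New(options)
    crawler.SetupSystem() // panics in setupMaxLinksCrawled, before the pipeline is wired
}
```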

octopus/doc.go

Lines changed: 37 additions & 0 deletions
```diff
@@ -0,0 +1,37 @@
+/*
+Package octopus implements a concurrent web crawler.
+The octopus uses a pipeline of channels to implement a non-blocking web crawler.
+The octopus also provides user configurable options that can be used to
+customize the behaviour of the crawler.
+
+Features
+
+Current Features of the crawler include:
+1. User specifiable Depth Limited Crawling
+2. User specified valid protocols
+3. User buildable adapters that the crawler feeds output to.
+4. Filter Duplicates.
+5. Filter URLs that fail a HEAD request.
+6. User specifiable max timeout between two successive url requests.
+7. User specifiable Max Number of Links to be crawled.
+
+
+Pipeline Overview
+
+The overview of the Pipeline is given below:
+1. Ingest
+2. Link Absolution
+3. Protocol Filter
+4. Duplicate Filter
+5. Invalid Url Filter (Urls whose HEAD request fails)
+6. Make GET Request
+7a. Send to Output Adapter
+7b. Check for Timeout (gap between two outputs on this channel).
+8. Max Links Crawled Limit Filter
+9. Depth Limit Filter
+10. Parse Page for more URLs.
+
+Note: The output from 7b. is fed to 8.
+1 -> 2 -> 3 -> 4 -> 5 -> 6 -> 7b -> 8 -> 9 -> 10 -> 1
+*/
+package octopus
```
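The doc comment describes the crawler as a chain of channel-connected stages. A hypothetical stage in that style, written against the exported Node, NodeChSet, and MakeNodeChSet names from this package (makeFilterStage itself is not part of the package):

```go
package example

import "github.com/rapidclock/web-octopus/octopus"

// makeFilterStage sketches one linear pipeline stage: it reads Nodes from
// its own input channel, applies a predicate, and forwards survivors to
// the next stage's channel set. Illustrative only.
func makeFilterStage(next *octopus.NodeChSet, keep func(*octopus.Node) bool) *octopus.NodeChSet {
    in := make(chan *octopus.Node)
    quit := make(chan int)
    go func() {
        for {
            select {
            case n := <-in:
                if keep(n) {
                    next.NodeCh <- n // pass the node downstream
                }
            case <-quit:
                return // a quit signal shuts the stage down
            }
        }
    }()
    return octopus.MakeNodeChSet(in, quit)
}
```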

octopus/modelfactory.go

Lines changed: 5 additions & 2 deletions
```diff
@@ -7,6 +7,7 @@ const (
 	anchorTag         = "a"
 	anchorAttrb       = "href"
 	defaultTimeToQuit = 5
+	defaultCrawlLimit int64 = -1
 )
 
 // NewWithDefaultOptions - Create an Instance of the Octopus with the default CrawlOptions.
@@ -40,12 +41,13 @@ func createNode(parentUrlStr, urlStr string, depth int64) *Node {
 	}
 }
 
+// Returns an instance of CrawlOptions with the values set to sensible defaults.
 func GetDefaultCrawlOptions() *CrawlOptions {
 	return &CrawlOptions{
 		MaxCrawlDepth:      defaultMaxDepth,
-		MaxCrawlLinks:      -1,
+		MaxCrawledUrls:     defaultCrawlLimit,
 		StayWithinBaseHost: false,
-		CrawlRatePerSec:    -1,
+		CrawlRate:          -1,
 		RespectRobots:      false,
 		IncludeBody:        true,
 		OpAdapter:          nil,
@@ -54,6 +56,7 @@ func GetDefaultCrawlOptions() *CrawlOptions {
 	}
 }
 
+// Utility function to create a NodeChSet given a created Node and Quit Channel.
 func MakeNodeChSet(nodeCh chan<- *Node, quitCh chan<- int) *NodeChSet {
 	return &NodeChSet{
 		NodeCh: nodeCh,
```

octopus/models.go

Lines changed: 57 additions & 31 deletions
```diff
@@ -12,37 +12,52 @@ import (
 // to an instance of the crawler.
 type octopus struct {
 	*CrawlOptions
-	visited         *sync.Map
-	isReady         bool
-	adapterChSet    *NodeChSet
-	isValidProtocol map[string]bool
-	timeToQuit      time.Duration
-	inputUrlStrChan chan string
-	masterQuitCh    chan int
+	visited           *sync.Map
+	isReady           bool
+	adapterChSet      *NodeChSet
+	isValidProtocol   map[string]bool
+	timeToQuit        time.Duration
+	inputUrlStrChan   chan string
+	masterQuitCh      chan int
+	crawledUrlCounter int64
 }
 
 // CrawlOptions is used to house options for crawling.
+//
 // You can specify depth of exploration for each link,
-// if crawler should ignore other hostnames (except from base host).
-// MaxCrawlDepth - Indicates the maximum depth that will be crawled,
-// for each new link.
-// MaxCrawlLinks - Specifies the Maximum Number of Unique Links that will be crawled.
-// Note : When combined with DepthPerLink, it will combine both.
-// Use -1 to indicate infinite links to be crawled (only bounded by depth of traversal).
-// IncludeBody - Include the response Body in the crawled NodeInfo (for further processing).
-// OpAdapter is a user specified concrete implementation of an Output Adapter. The crawler
-// will pump output onto the implementation's channel returned by its Consume method.
-// CrawlRate is the rate at which requests will be made.
-// RespectRobots (unimplemented) choose whether to respect robots.txt or not.
-// ValidProtocols - This is an array containing the list of url protocols that
-// should be crawled.
-// TimeToQuit - represents the total time to wait between two new nodes to be
-// generated before the crawler quits. This is in seconds.
+// if crawler should ignore other host names (except from base host).
+//
+// MaxCrawlDepth - Indicates the maximum depth that will be crawled,
+// for each new link.
+//
+// MaxCrawledUrls - Specifies the Maximum Number of Unique Links that will be crawled.
+// Note: When combined with DepthPerLink, it will combine both.
+// Use -1 to indicate infinite links to be crawled (only bounded by depth of traversal).
+//
+// StayWithinBaseHost - (unimplemented) Ensures crawler stays within the
+// level 1 link's hostname.
+//
+// CrawlRate (unimplemented) is the rate at which requests will be made.
+// In seconds.
+//
+// RespectRobots (unimplemented) choose whether to respect robots.txt or not.
+//
+// IncludeBody - (unimplemented) Include the response Body in the crawled
+// NodeInfo (for further processing).
+//
+// OpAdapter is a user specified concrete implementation of an Output Adapter. The crawler
+// will pump output onto the implementation's channel returned by its Consume method.
+//
+// ValidProtocols - This is an array containing the list of url protocols that
+// should be crawled.
+//
+// TimeToQuit - represents the total time to wait between two new nodes to be
+// generated before the crawler quits. This is in seconds.
 type CrawlOptions struct {
 	MaxCrawlDepth      int64
-	MaxCrawlLinks      int64
+	MaxCrawledUrls     int64
 	StayWithinBaseHost bool
-	CrawlRatePerSec    int64
+	CrawlRate          int64
 	RespectRobots      bool
 	IncludeBody        bool
 	OpAdapter          OutputAdapter
@@ -63,12 +78,17 @@ type Node struct {
 	Body io.ReadCloser
 }
 
+// StdChannels holds the standard set of channels that are used
+// for special operations. Will include channels for Logging, Statistics,
+// etc. in the future.
 type StdChannels struct {
 	QuitCh chan<- int
 	// logCh chan<- string
 	// errorCh chan<- string
 }
 
+// NodeChSet is the standard set of channels used to build the concurrency
+// pipelines in the crawler.
 type NodeChSet struct {
 	NodeCh chan<- *Node
 	*StdChannels
@@ -80,13 +100,19 @@ type ingestPipeChSet struct {
 	QuitCh chan int
 }
 
-// OutputAdapter is the interface for the Adapter that is used to handle
-// output from the Octopus Crawler.
-// The contract stipulates that the crawler provides the channel
-// to listen for a quit command.
-// The crawler pumps its output onto the returned channel of the Consume method.
-// Implementers of the interface should listen on this channel for output from
-// the crawler.
+// OutputAdapter is the interface that has to be implemented in order to
+// handle outputs from the octopus crawler.
+//
+// The octopus will call the OutputAdapter.Consume() method and deliver
+// all relevant output and quit signals on the channels included in the
+// received NodeChSet.
+//
+// This implies that it is the responsibility of the user who implements
+// OutputAdapter to handle processing the output of the crawler that is
+// delivered on the NodeChSet.NodeCh.
+//
+// Implementers of the interface should listen to the included channels in
+// the output of Consume() for output from the crawler.
 type OutputAdapter interface {
 	Consume() *NodeChSet
 }
```
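As a sketch of the contract this comment spells out, a minimal custom adapter (the CountingAdapter type is hypothetical, modeled on StdOpAdapter above) could look like:

```go
package example

import (
    "sync/atomic"

    "github.com/rapidclock/web-octopus/octopus"
)

// CountingAdapter is a hypothetical OutputAdapter that just counts the
// nodes the crawler delivers on the channel set returned by Consume.
type CountingAdapter struct {
    count int64
}

func (c *CountingAdapter) Consume() *octopus.NodeChSet {
    listenCh := make(chan *octopus.Node)
    quitCh := make(chan int)
    go func() {
        for {
            select {
            case <-listenCh: // one crawled node delivered
                atomic.AddInt64(&c.count, 1)
            case <-quitCh: // crawler signalled shutdown
                return
            }
        }
    }()
    return octopus.MakeNodeChSet(listenCh, quitCh)
}
```

Like the built-in adapters, an instance would be assigned to CrawlOptions.OpAdapter before SetupSystem is called.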

octopus/pipe_ctrl_limitcrawl.go

Lines changed: 19 additions & 0 deletions
```diff
@@ -0,0 +1,19 @@
+package octopus
+
+import (
+	"sync/atomic"
+)
+
+func (o *octopus) makeLimitCrawlPipe(inChSet *NodeChSet) *NodeChSet {
+	return stdLinearNodeFunc(o.checkWithinLimit, inChSet)
+}
+
+func (o *octopus) checkWithinLimit(node *Node, outChSet *NodeChSet) {
+	if v := atomic.AddInt64(&o.crawledUrlCounter,
+		1); v <= o.MaxCrawledUrls {
+		outChSet.NodeCh <- node
+	} else {
+		outChSet.QuitCh <- 1
+		o.masterQuitCh <- 1
+	}
+}
```
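checkWithinLimit relies on atomic.AddInt64 so that concurrent pipeline goroutines can never pass more than MaxCrawledUrls nodes in total. A standalone, runnable sketch of that pattern (the limit and goroutine count are illustrative):

```go
package main

import (
    "fmt"
    "sync"
    "sync/atomic"
)

func main() {
    const limit = 5 // stand-in for MaxCrawledUrls
    var counter, passed int64
    var wg sync.WaitGroup
    for i := 0; i < 20; i++ {
        wg.Add(1)
        go func() {
            defer wg.Done()
            // The increment and the comparison use one atomic result, so
            // exactly `limit` goroutines ever see a value within the limit.
            if atomic.AddInt64(&counter, 1) <= limit {
                atomic.AddInt64(&passed, 1)
            }
        }()
    }
    wg.Wait()
    fmt.Println(passed) // always 5, regardless of scheduling
}
```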
