@@ -2,51 +2,84 @@ package octopus
22
33import (
44 "io"
5+ "sync"
56 "time"
67)
78
8- // Node is used to represent each crawled link and its associated depth of crawl.
9- type Node struct {
10- URLString string
11- Depth int
12- }
13-
14- // webOctopus is a concurrent version of webSpider.
15- // It has an inbuilt parser based of htmlparser.Parser to collect all links in a web-page.
9+ // octopus is a concurrent web crawler.
10+ // It has an inbuilt parser based of html.NewTokenizer to collect all links in a web-page.
1611// It also has a CrawlOptions structure to initialize setting specific
1712// to an instance of the crawler.
18- type webOctopus struct {
19- CrawlOptions
20- visited map [Node ]bool
13+ type octopus struct {
14+ * CrawlOptions
15+ visited * sync.Map
16+ isReady bool
17+ adapterChSet * NodeChSet
18+ isValidProtocol map [string ]bool
19+ timeToQuit time.Duration
20+ inputUrlStrChan chan string
21+ masterQuitCh chan int
2122}
2223
2324// CrawlOptions is used to house options for crawling.
2425// You can specify depth of exploration for each link,
2526// if crawler should ignore other hostnames (except from base host).
26- // MaxLinksCrawled - Specifies the Maximum Number of Unique Links that will be crawled.
27+ // MaxCrawlDepth - Indicates the maximum depth that will be crawled,
28+ // for each new link.
29+ // MaxCrawlLinks - Specifies the Maximum Number of Unique Links that will be crawled.
2730// Note : When combined with DepthPerLink, it will combine both.
2831// Use -1 to indicate infinite links to be crawled (only bounded by depth of traversal).
29- // IncludeBody - Include the response Body in the crawled Node (for further processing).
32+ // IncludeBody - Include the response Body in the crawled NodeInfo (for further processing).
3033// OpAdapter is a user specified concrete implementation of an Output Adapter. The crawler
3134// will pump output onto the implementation's channel returned by its Consume method.
3235// CrawlRate is the rate at which requests will be made.
3336// RespectRobots (unimplemented) choose whether to respect robots.txt or not.
37+ // ValidProtocols - This is an array containing the list of url protocols that
38+ // should be crawled.
39+ // TimeToQuit - represents the total time to wait between two new nodes to be
40+ // generated before the crawler quits. This is in seconds.
3441type CrawlOptions struct {
35- DepthPerLink int16
36- MaxLinksCrawled int64
42+ MaxCrawlDepth int64
43+ MaxCrawlLinks int64
3744 StayWithinBaseHost bool
38- BaseURLString string
39- CrawlRate time.Duration
45+ CrawlRatePerSec int64
4046 RespectRobots bool
4147 IncludeBody bool
4248 OpAdapter OutputAdapter
49+ ValidProtocols []string
50+ TimeToQuit int64
4351}
4452
// NodeInfo is used to represent each crawled link and its associated crawl depth.
type NodeInfo struct {
	ParentUrlString string // URL of the page on which this link was found
	UrlString       string // the crawled URL itself
	Depth           int64  // crawl depth at which this URL was reached
}

// Node encloses a NodeInfo and its Body (HTML) Content.
type Node struct {
	*NodeInfo
	Body io.ReadCloser // response body; populated when CrawlOptions.IncludeBody is set
}
4965
// StdChannels groups the standard control channels shared by crawler components.
type StdChannels struct {
	QuitCh chan<- int // send-only channel used to signal shutdown
	// logCh chan<- string
	// errorCh chan<- string
}
71+
72+ type NodeChSet struct {
73+ NodeCh chan <- * Node
74+ * StdChannels
75+ }
76+
77+ type ingestPipeChSet struct {
78+ NodeCh chan * Node
79+ StrCh chan string
80+ QuitCh chan int
81+ }
82+
5083// OutputAdapter is the interface for the Adapter that is used to handle
5184// output from the Octopus Crawler.
5285// The contract stipulates that the crawler provides the channel
@@ -55,5 +88,5 @@ type CrawlOutput struct {
5588// Implementers of the interface should listen on this channel for output from
5689// the crawler.
5790type OutputAdapter interface {
58- Consume (quitCh <- chan bool ) chan <- CrawlOutput
91+ Consume () * NodeChSet
5992}
0 commit comments