
Can't crawl Facebook.com #11

@webb365

Description

Can't crawl https://facebook.com. The result is:
[Thu May 14 2015 17:04:40 GMT+0200 (CEST)] INFO Beginning Crawl.
[Thu May 14 2015 17:04:40 GMT+0200 (CEST)] INFO Crawling https://www.facebook.com/
[Thu May 14 2015 17:04:40 GMT+0200 (CEST)] INFO Finished crawl.
[Thu May 14 2015 17:04:40 GMT+0200 (CEST)] INFO >> Started: Thursday, May 14th 2015, 5:04:40 pm
[Thu May 14 2015 17:04:40 GMT+0200 (CEST)] INFO >> Duration: 0 hours, 0 minutes, 0 seconds.
[Thu May 14 2015 17:04:40 GMT+0200 (CEST)] INFO >> Page(s) crawled: 0
[Thu May 14 2015 17:04:40 GMT+0200 (CEST)] INFO >> Page(s) nofollowed: 0
[Thu May 14 2015 17:04:40 GMT+0200 (CEST)] INFO >> Request count: 0
[Thu May 14 2015 17:04:40 GMT+0200 (CEST)] INFO >> Request errors: 0
[Thu May 14 2015 17:04:40 GMT+0200 (CEST)] INFO >> 200 OK count : 0
[Thu May 14 2015 17:04:40 GMT+0200 (CEST)] INFO >> 301 count : 0
[Thu May 14 2015 17:04:40 GMT+0200 (CEST)] INFO >> 302 count : 0
[Thu May 14 2015 17:04:40 GMT+0200 (CEST)] INFO >> 404 count : 0
[Thu May 14 2015 17:04:40 GMT+0200 (CEST)] INFO >> Filtered due to
[Thu May 14 2015 17:04:40 GMT+0200 (CEST)] INFO >> Disallowed domain: 0
[Thu May 14 2015 17:04:40 GMT+0200 (CEST)] INFO >> Blacklist: 0
The code is:
var roboto = require('roboto');
var html_strip = require('htmlstrip-native').html_strip;

var stripOptions = {
  include_script: false,
  include_style: false,
  compact_whitespace: true
};

var crawler = new roboto.Crawler({
  startUrls: [
    'https://www.facebook.com/'
  ],
  constrainToRootDomains: true,
  obeyRobotsTxt: false,
  obeyNofollow: false,
  maxDepth: 50
});

/*
Dead/broken links start

var sites = [];
var deadLinks = {
  E503: [],
  E500: [],
  E404: [],
  E403: [],
  E302: []
};

deadLinks.E503.push();
*/
// Returns the time it took to make the request for the current page
// (assumes a 'Start-time' header was stamped on the outgoing request)
crawler.parseField('requestTime', function(response) {
  return new Date().getTime() - response.request.headers['Start-time'];
});
// Returns the URL of the current page
crawler.parseField('url', function(response) {
  return response.url;
});
// Returns the HTTP status code of the current page
crawler.parseField('statusCode', function(response) {
  return response.statusCode;
});
// Returns the title of the current page
crawler.parseField('title', function(response, $) {
  return $('head title').text();
});
// Returns the word count of the current page
crawler.parseField('TextCount', function(response, $) {
  var html = $('body').html();
  if (html) {
    return countWords(html_strip(html, stripOptions));
  }
});

crawler.on('item', function(item) {
  //console.log(item.url);
});

crawler.on('finish', function() {
});

crawler.crawl();

/*
Functions
*/
function countWords(s) {
  s = s.replace(/(^\s*)|(\s*$)/gi, ''); // exclude leading/trailing whitespace
  s = s.replace(/[ ]{2,}/gi, ' ');      // collapse 2 or more spaces into 1
  s = s.replace(/\n /, '\n');           // exclude a space after a newline
  return s.split(' ').length;
}
