-
Notifications
You must be signed in to change notification settings - Fork 24
Description
Can't crawl https://facebook.com. The result is:
[Thu May 14 2015 17:04:40 GMT+0200 (CEST)] INFO Beginning Crawl.
[Thu May 14 2015 17:04:40 GMT+0200 (CEST)] INFO Crawling https://www.facebook.com/
[Thu May 14 2015 17:04:40 GMT+0200 (CEST)] INFO Finished crawl.
[Thu May 14 2015 17:04:40 GMT+0200 (CEST)] INFO >> Started: Thursday, May 14th 2015, 5:04:40 pm
[Thu May 14 2015 17:04:40 GMT+0200 (CEST)] INFO >> Duration: 0 hours, 0 minutes, 0 seconds.
[Thu May 14 2015 17:04:40 GMT+0200 (CEST)] INFO >> Page(s) crawled: 0
[Thu May 14 2015 17:04:40 GMT+0200 (CEST)] INFO >> Page(s) nofollowed: 0
[Thu May 14 2015 17:04:40 GMT+0200 (CEST)] INFO >> Request count: 0
[Thu May 14 2015 17:04:40 GMT+0200 (CEST)] INFO >> Request errors: 0
[Thu May 14 2015 17:04:40 GMT+0200 (CEST)] INFO >> 200 OK count : 0
[Thu May 14 2015 17:04:40 GMT+0200 (CEST)] INFO >> 301 count : 0
[Thu May 14 2015 17:04:40 GMT+0200 (CEST)] INFO >> 302 count : 0
[Thu May 14 2015 17:04:40 GMT+0200 (CEST)] INFO >> 404 count : 0
[Thu May 14 2015 17:04:40 GMT+0200 (CEST)] INFO >> Filtered due to
[Thu May 14 2015 17:04:40 GMT+0200 (CEST)] INFO >> Disallowed domain: 0
[Thu May 14 2015 17:04:40 GMT+0200 (CEST)] INFO >> Blacklist: 0
The code is:
var roboto = require('roboto');
var html_strip = require('htmlstrip-native').html_strip;
// Options handed to html_strip() by the 'TextCount' parser below.
// NOTE(review): presumably these drop <script>/<style> content and collapse
// whitespace runs — confirm against the htmlstrip-native documentation.
var stripOptions = {
include_script : false,
include_style : false,
compact_whitespace : true
};
// Crawler instance configured with a single start URL.
// NOTE(review): the exact meaning of constrainToRootDomains, obeyRobotsTxt,
// obeyNofollow and maxDepth is defined by roboto, not by this file — verify
// against the roboto README before changing them.
var crawler = new roboto.Crawler({
startUrls: [
'https://www.facebook.com/'
],
constrainToRootDomains: true,
obeyRobotsTxt: false,
obeyNofollow: false,
maxDepth: 50,
});
// Disabled draft of dead/broken-link bookkeeping; nothing below runs.
/*
Dead/broken links start
var sites = [];
var deadLinks = {
E503:[],
E500:[],
E404:[],
E403:[],
E302:[]
};
deadLinks.E503.push();
*/
// 'requestTime': current time in ms minus the request's 'Start-time' header.
// NOTE(review): nothing in this file sets a 'Start-time' header; if it is
// absent the subtraction yields NaN — confirm where the header comes from.
crawler.parseField('requestTime', function(response) {
return (new Date().getTime()-response.request.headers['Start-time']);
});
// 'url': returns response.url for the current page.
crawler.parseField('url', function(response) {
return response.url;
});
// 'statusCode': returns response.statusCode for the current page.
crawler.parseField('statusCode', function(response) {
return response.statusCode;
});
// 'title': returns the text of the page's <head><title> element.
crawler.parseField('title', function(response, $) {
return $('head title').text();
});
// 'TextCount': word count of the <body> markup after passing it through
// html_strip() with stripOptions. When the body has no HTML the callback
// falls through and implicitly returns undefined.
crawler.parseField('TextCount', function(response, $) {
var html = $('body').html();
if (html) {
return countWords(html_strip(html, stripOptions));
}
});
// 'item' handler: currently a no-op; the only statement is commented out.
crawler.on('item', function(item) {
//console.log(item.url);
});
// 'finish' handler: intentionally empty placeholder.
crawler.on('finish', function() {
});
// Kick off the crawl with the configuration registered above.
crawler.crawl();
/*
Functions
*/
/**
 * Counts the whitespace-separated words in a string.
 *
 * Any run of whitespace (spaces, tabs, newlines) is treated as a single
 * separator and leading/trailing whitespace is ignored, so an empty or
 * whitespace-only string yields 0 instead of 1.
 *
 * @param {string} s - Text whose words should be counted.
 * @returns {number} Number of words found in `s`.
 */
function countWords(s){
  // Treat missing input like empty text rather than throwing.
  if (s == null) {
    return 0;
  }
  var trimmed = String(s).trim();
  // ''.split(...) would return [''] (length 1), so short-circuit here.
  if (trimmed === '') {
    return 0;
  }
  return trimmed.split(/\s+/).length;
}