diff --git a/intro/tutorial.rst b/intro/tutorial.rst index afe420f..933c586 100644 --- a/intro/tutorial.rst +++ b/intro/tutorial.rst @@ -110,10 +110,10 @@ Spider是用户编写用于从单个网站(或者一些网站)爬取数据的类 class DmozSpider(scrapy.spiders.Spider): name = "dmoz" - allowed_domains = ["dmoz.org"] + allowed_domains = ["dmoztools.net"] start_urls = [ - "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/", - "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/" + "http://dmoztools.net/Computers/Programming/Languages/Python/Books/", + "http://dmoztools.net/Computers/Programming/Languages/Python/Resources/" ] def parse(self, response): @@ -128,7 +128,7 @@ Spider是用户编写用于从单个网站(或者一些网站)爬取数据的类 scrapy crawl dmoz -``crawl dmoz`` 启动用于爬取 ``dmoz.org`` 的spider,您将得到类似的输出:: +``crawl dmoz`` 启动用于爬取 ``dmoztools.net`` 的spider,您将得到类似的输出:: 2014-01-23 18:13:07-0400 [scrapy] INFO: Scrapy started (bot: tutorial) 2014-01-23 18:13:07-0400 [scrapy] INFO: Optional features available: ... @@ -138,8 +138,8 @@ Spider是用户编写用于从单个网站(或者一些网站)爬取数据的类 2014-01-23 18:13:07-0400 [scrapy] INFO: Enabled spider middlewares: ... 2014-01-23 18:13:07-0400 [scrapy] INFO: Enabled item pipelines: ... 2014-01-23 18:13:07-0400 [dmoz] INFO: Spider opened - 2014-01-23 18:13:08-0400 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/> (referer: None) - 2014-01-23 18:13:09-0400 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Books/> (referer: None) + 2014-01-23 18:13:08-0400 [dmoz] DEBUG: Crawled (200) <GET http://dmoztools.net/Computers/Programming/Languages/Python/Resources/> (referer: None) + 2014-01-23 18:13:09-0400 [dmoz] DEBUG: Crawled (200) <GET http://dmoztools.net/Computers/Programming/Languages/Python/Books/> (referer: None) 2014-01-23 18:13:09-0400 [dmoz] INFO: Closing spider (finished) 查看包含 ``[dmoz]`` 的输出,可以看到输出的log中包含定义在 ``start_urls`` 的初始URL,并且与spider中是一一对应的。在log中可以看到其没有指向其他页面( ``(referer:None)`` )。