diff --git a/crawler/crawler/__init__.pyc b/crawler/crawler/__init__.pyc
index 499c3cd..70d08a9 100644
Binary files a/crawler/crawler/__init__.pyc and b/crawler/crawler/__init__.pyc differ
diff --git a/crawler/crawler/input.txt b/crawler/crawler/input.txt
new file mode 100644
index 0000000..63e7ce7
--- /dev/null
+++ b/crawler/crawler/input.txt
@@ -0,0 +1,2 @@
+1
+www.cs.tsinghua.edu.cn
diff --git a/crawler/crawler/items.pyc b/crawler/crawler/items.pyc
index 39d7b2f..b940bad 100644
Binary files a/crawler/crawler/items.pyc and b/crawler/crawler/items.pyc differ
diff --git a/crawler/crawler/settings.py b/crawler/crawler/settings.py
index c18b6dc..470a2c7 100644
--- a/crawler/crawler/settings.py
+++ b/crawler/crawler/settings.py
@@ -16,6 +16,6 @@ NEWSPIDER_MODULE = 'crawler.spiders'
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'crawler (+http://www.yourdomain.com)'
 
-ITEM_PIPELINES = {
-    'crawler.pipelines.CrawlerPipeline': 300
-}
+#ITEM_PIPELINES = {
+#    'crawler.pipelines.CrawlerPipeline': 300
+#}
diff --git a/crawler/crawler/settings.pyc b/crawler/crawler/settings.pyc
index f48b6cc..ebe6158 100644
Binary files a/crawler/crawler/settings.pyc and b/crawler/crawler/settings.pyc differ
diff --git a/crawler/crawler/spiders/.ImgCrawler.py.swp b/crawler/crawler/spiders/.ImgCrawler.py.swp
new file mode 100644
index 0000000..9799eab
Binary files /dev/null and b/crawler/crawler/spiders/.ImgCrawler.py.swp differ
diff --git a/crawler/crawler/spiders/ImgCrawler.py b/crawler/crawler/spiders/ImgCrawler.py
index 9b596a5..730af1a 100644
--- a/crawler/crawler/spiders/ImgCrawler.py
+++ b/crawler/crawler/spiders/ImgCrawler.py
@@ -1,13 +1,33 @@
-from scrapy.spider import BaseSpider
+from scrapy.spider import BaseSpider, Spider
 from scrapy.http import Request
 from crawler.items import CrawlerItem
 import re
 
-class Crawler(BaseSpider):
+class Crawler(Spider): #Scrapy 0.22
+#class Crawler(BaseSpider): #Scrapy 0.18
     name = 'img'
-    allowed_domains = ['www.cs.tsinghua.edu.cn']
-    start_urls = ['http://www.cs.tsinghua.edu.cn']
-    urlSet = set()
+
+    def __init__(self, *args, **kwargs):
+        """Configure the crawl from the two-line ``input.txt`` in the cwd.
+
+        Line 1 is the integer mode and line 2 its argument. In mode 1 the
+        argument is the single domain to crawl; any other mode starts with
+        no allowed domains and no start URLs.
+        """
+        # Let the base spider run its own setup and accept spider arguments.
+        super(Crawler, self).__init__(*args, **kwargs)
+        self.urlSet = set()
+        # Close the file deterministically instead of leaking the handle.
+        with open('input.txt') as f:
+            fr = f.readlines()
+        self.mode = int(fr[0].strip())
+        self.tmp = fr[1].strip()
+        if self.mode == 1:
+            self.allowed_domains = [self.tmp]
+            self.start_urls = ['http://' + self.tmp]
+        else:
+            self.allowed_domains = []
+            self.start_urls = []
 
     def parse(self, response):
         html = response.body
@@ -21,13 +41,12 @@ class Crawler(BaseSpider):
 
         for url in urlList:
             if url not in self.urlSet:
-                yield Request('http://www.cs.tsinghua.edu.cn' + url.strip('"'), self.parse)
-                #pass
+                yield Request('http://' + self.allowed_domains[0] + url.strip('"'), self.parse)
 
         item = CrawlerItem()
         item['url'] = response.url
         item['imgList'] = []
         for img in imgList:
-            item['imgList'].append('http://www.cs.tsinghua.edu.cn' + img.strip('"'))
+            item['imgList'].append(img.strip('"'))
 
         yield item
diff --git a/crawler/crawler/spiders/ImgCrawler.pyc b/crawler/crawler/spiders/ImgCrawler.pyc
index 6ea0851..97fec19 100644
Binary files a/crawler/crawler/spiders/ImgCrawler.pyc and b/crawler/crawler/spiders/ImgCrawler.pyc differ
diff --git a/crawler/crawler/spiders/__init__.pyc b/crawler/crawler/spiders/__init__.pyc
index e9ef2b7..d5ec17c 100644
Binary files a/crawler/crawler/spiders/__init__.pyc and b/crawler/crawler/spiders/__init__.pyc differ
diff --git a/run.sh b/run.sh
new file mode 100755
index 0000000..7a959b3
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Launch the 'img' spider. The mode and its argument are handed to the
+# spider through crawler/crawler/input.txt (line 1: mode, line 2: argument).
+
+if [ $# -lt 2 ]; then
+    echo 'Usage:'
+    echo '1. run 1 allowed_domain'
+    echo 'crawl all imgs in the allowed_domain'
+    echo '2. run 2 input_file'
+    echo 'crawl the imgs and attach the corresponding tags in input_file'
+    # A missing argument is an error, so report failure to the caller.
+    exit 1
+fi
+
+mode=$1
+target=$2
+input_file=input.txt
+
+# Reject unknown modes before touching the filesystem.
+case "${mode}" in
+    1|2) ;;
+    *)
+        echo 'Invalid mode!'
+        exit 1
+        ;;
+esac
+
+# Stop if the project directory is missing; otherwise input.txt would be
+# written, and scrapy started, in the wrong directory.
+cd crawler/crawler || exit 1
+
+# Both modes hand over their arguments the same way. Quoting keeps values
+# containing spaces or glob characters intact.
+printf '%s\n%s\n' "${mode}" "${target}" > "${input_file}"
+scrapy crawl img
-- 
libgit2 0.21.2