Commit 4737e1662edbd91e50699151b52004c479a429e1
1 parent
9a5cac33
Exists in
master
and in
2 other branches
staged.
Showing
19 changed files
with
221 additions
and
84 deletions
Show diff stats
.idea/ImageR.iml
... | ... | @@ -2,7 +2,7 @@ |
2 | 2 | <module type="PYTHON_MODULE" version="4"> |
3 | 3 | <component name="NewModuleRootManager"> |
4 | 4 | <content url="file://$MODULE_DIR$" /> |
5 | - <orderEntry type="jdk" jdkName="Python 2.7.8 virtualenv at ~/.virtualenvs/env1" jdkType="Python SDK" /> | |
5 | + <orderEntry type="jdk" jdkName="Python 2.7.6 virtualenv at ~/.virtualenvs/env0" jdkType="Python SDK" /> | |
6 | 6 | <orderEntry type="sourceFolder" forTests="false" /> |
7 | 7 | </component> |
8 | 8 | </module> |
9 | 9 | \ No newline at end of file | ... | ... |
run.sh
... | ... | @@ -1,63 +0,0 @@ |
1 | -#!/bin/zsh | |
2 | -# chunk @ 2014 | |
3 | -######################################################################################## | |
4 | -## | |
5 | -## F**k World! | |
6 | -## | |
7 | -## | |
8 | -## export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:`hbase classpath` | |
9 | -## export SPARK_CLASSPATH=$SPARK_CLASSPATH:`hbase classpath` | |
10 | -## | |
11 | -## spark-submit \ | |
12 | -## --driver-memory 1g \ | |
13 | -## --executor-memory 1g \ | |
14 | -## --executor-cores 2 \ | |
15 | -## --deploy-mode client \ | |
16 | -## --master yarn \ | |
17 | -## --class "FuckWorld" \ | |
18 | -## $APP_JAR $ARGS | |
19 | -## | |
20 | -## spark-class org.apache.spark.deploy.yarn.Client \ | |
21 | -## --num-executors 2 \ | |
22 | -## --executor-cores 2 \ | |
23 | -## --driver-memory 1g \ | |
24 | -## --executor-memory 1g \ | |
25 | -## --name "F**k World" \ | |
26 | -## --jar $APP_JAR \ | |
27 | -## --class "FuckWorld" \ | |
28 | -## --args $ARGS | |
29 | -## | |
30 | -##spark-submit \ | |
31 | -## --driver-memory 1g \ | |
32 | -## --executor-memory 1g \ | |
33 | -## --executor-cores 2 \ | |
34 | -## --master spark://HPC-server:7077 \ | |
35 | -## --jars $SPARK_HOME/lib/spark-examples-1.2.0-hadoop2.5.1.jar \ | |
36 | -## $APP_JAR $ARGS | |
37 | -######################################################################################## | |
38 | - | |
39 | -source /home/hadoop/.zshrc | |
40 | -v env1 | |
41 | - | |
42 | -export PYSPARK_PYTHON=/home/hadoop/.virtualenvs/env1/bin/python | |
43 | -export SPARK_CLASSPATH=`hbase classpath` | |
44 | -export SPARK_JAR=hdfs://HPC-server:9000/user/spark/share/lib/spark-assembly-1.2.0-hadoop2.5.1.jar | |
45 | - | |
46 | -#COMPRESSED=/home/hadoop/workspace/pycharm/test/ImageR.zip | |
47 | -# --py-files $COMPRESSED \ | |
48 | -COMPRESSED=/home/hadoop/workspace/pycharm/tmp/ImageR/mdata.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mfeat.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mjpeg.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/msteg.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mmodel.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mspark.zip | |
49 | - | |
50 | -APP=test_spark.py | |
51 | -#APP=test_model.py | |
52 | -ARGS= | |
53 | - | |
54 | -spark-submit \ | |
55 | - --driver-memory 1g \ | |
56 | - --executor-memory 2g \ | |
57 | - --executor-cores 2 \ | |
58 | - --master spark://HPC-server:7077 \ | |
59 | - --jars $SPARK_HOME/lib/spark-examples-1.2.0-hadoop2.5.1.jar \ | |
60 | - --py-files $COMPRESSED \ | |
61 | - $APP $ARGS | |
62 | - | |
63 | - |
... | ... | @@ -0,0 +1,63 @@ |
#!/bin/zsh
# chunk @ 2014
#
# Launch the test application on the standalone Spark master, shipping the
# project's zipped modules to the executors via --py-files.

source /home/hadoop/.zshrc
v env1

# Executors must use the same virtualenv interpreter as the driver.
export PYSPARK_PYTHON=/home/hadoop/.virtualenvs/env1/bin/python
export SPARK_CLASSPATH=`hbase classpath`
export SPARK_JAR=hdfs://HPC-server:9000/user/spark/share/lib/spark-assembly-1.2.0-hadoop2.5.1.jar

# Comma-separated list of zipped project packages shipped to every executor.
COMPRESSED=/home/hadoop/workspace/pycharm/tmp/ImageR/mdata.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mfeat.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mjpeg.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/msteg.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mmodel.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mspark.zip

# Entry-point script and its arguments.
APP=test_spark.py
ARGS=

spark-submit \
    --driver-memory 1g \
    --executor-memory 2g \
    --executor-cores 2 \
    --master spark://HPC-server:7077 \
    --jars $SPARK_HOME/lib/spark-examples-1.2.0-hadoop2.5.1.jar \
    --py-files $COMPRESSED \
    $APP $ARGS
... | ... | @@ -0,0 +1,23 @@ |
#!/bin/zsh
# chunk @ 2014
#
# Run the "dmoz" spider and dump the scraped items into a JSON feed file.

####################################################################
## environment variables
####################################################################
# FIX: was "export export TERM=xterm", which also exported a bogus
# variable literally named "export".
export TERM=xterm
source /home/hadoop/.zshrc
v env0

####################################################################
## additional files list
####################################################################
FILE=hehe.json

#scrapy runspider spider/test.py
cd ./spider/mspider/ || exit 1
# Remove any stale feed first: "scrapy crawl -o" appends to an
# existing file instead of overwriting it.
[ -f "$FILE" ] && rm "$FILE"
scrapy crawl dmoz -o "$FILE"
spider/__init__.py
... | ... | @@ -0,0 +1,44 @@ |
1 | +[{"link": ["/"], "title": ["Top"]}, | |
2 | +{"link": ["/Computers/"], "title": ["Computers"]}, | |
3 | +{"link": ["/Computers/Programming/"], "title": ["Programming"]}, | |
4 | +{"link": ["/Computers/Programming/Languages/"], "title": ["Languages"]}, | |
5 | +{"link": ["/Computers/Programming/Languages/Python/"], "title": ["Python"]}, | |
6 | +{"link": [], "title": []}, | |
7 | +{"link": ["/Computers/Programming/Resources/"], "title": ["Computers: Programming: Resources"]}, | |
8 | +{"link": ["http://www.pythonware.com/daily/"], "title": ["eff-bot's Daily Python URL"]}, | |
9 | +{"link": ["http://www.oinko.net/freepython/"], "title": ["Free Python and Zope Hosting Directory"]}, | |
10 | +{"link": ["http://oreilly.com/python/"], "title": ["O'Reilly Python Center"]}, | |
11 | +{"link": ["https://www.python.org/dev/"], "title": ["Python Developer's Guide"]}, | |
12 | +{"link": ["http://win32com.goermezer.de/"], "title": ["Social Bug"]}, | |
13 | +{"link": ["/"], "title": ["Top"]}, | |
14 | +{"link": ["/Computers/"], "title": ["Computers"]}, | |
15 | +{"link": ["/Computers/Programming/"], "title": ["Programming"]}, | |
16 | +{"link": ["/Computers/Programming/Languages/"], "title": ["Languages"]}, | |
17 | +{"link": ["/Computers/Programming/Languages/Python/"], "title": ["Python"]}, | |
18 | +{"link": [], "title": []}, | |
19 | +{"link": ["/Computers/Programming/Languages/Python/Resources/"], "title": ["Computers: Programming: Languages: Python: Resources"]}, | |
20 | +{"link": ["/Computers/Programming/Languages/Ruby/Books/"], "title": ["Computers: Programming: Languages: Ruby: Books"]}, | |
21 | +{"link": ["/World/Deutsch/Computer/Programmieren/Sprachen/Python/B%C3%BCcher"], "title": ["Deutsch"]}, | |
22 | +{"link": ["/World/Russian/%D0%9A%D0%BE%D0%BC%D0%BF%D1%8C%D1%8E%D1%82%D0%B5%D1%80%D1%8B/%D0%9F%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D0%BC%D0%B8%D1%80%D0%BE%D0%B2%D0%B0%D0%BD%D0%B8%D0%B5/%D0%AF%D0%B7%D1%8B%D0%BA%D0%B8/Python/%D0%9A%D0%BD%D0%B8%D0%B3%D0%B8"], "title": ["\u0420\u0443\u0441\u0441\u043a\u0438\u0439"]}, | |
23 | +{"link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0130260363,00%2Ben-USS_01DBC.html"], "title": ["Core Python Programming"]}, | |
24 | +{"link": ["http://www.brpreiss.com/books/opus7/html/book.html"], "title": ["Data Structures and Algorithms with Object-Oriented Design Patterns in Python"]}, | |
25 | +{"link": ["http://www.diveintopython.net/"], "title": ["Dive Into Python 3"]}, | |
26 | +{"link": ["http://rhodesmill.org/brandon/2011/foundations-of-python-network-programming/"], "title": ["Foundations of Python Network Programming"]}, | |
27 | +{"link": ["http://www.techbooksforfree.com/perlpython.shtml"], "title": ["Free Python books"]}, | |
28 | +{"link": ["http://www.freetechbooks.com/python-f6.html"], "title": ["FreeTechBooks: Python Scripting Language"]}, | |
29 | +{"link": ["http://greenteapress.com/thinkpython/"], "title": ["How to Think Like a Computer Scientist: Learning with Python"]}, | |
30 | +{"link": ["http://www.network-theory.co.uk/python/intro/"], "title": ["An Introduction to Python"]}, | |
31 | +{"link": ["http://www.freenetpages.co.uk/hp/alan.gauld/"], "title": ["Learn to Program Using Python"]}, | |
32 | +{"link": ["http://www.wiley.com/WileyCDA/WileyTitle/productCd-0471219754.html"], "title": ["Making Use of Python"]}, | |
33 | +{"link": ["http://hetland.org/writing/practical-python/"], "title": ["Practical Python"]}, | |
34 | +{"link": ["http://sysadminpy.com/"], "title": ["Pro Python System Administration"]}, | |
35 | +{"link": ["http://www.qtrac.eu/py3book.html"], "title": ["Programming in Python 3 (Second Edition)"]}, | |
36 | +{"link": ["http://www.wiley.com/WileyCDA/WileyTitle/productCd-0764548077.html"], "title": ["Python 2.1 Bible"]}, | |
37 | +{"link": ["https://www.packtpub.com/python-3-object-oriented-programming/book"], "title": ["Python 3 Object Oriented Programming"]}, | |
38 | +{"link": ["http://www.network-theory.co.uk/python/language/"], "title": ["Python Language Reference Manual"]}, | |
39 | +{"link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0130409561,00%2Ben-USS_01DBC.html"], "title": ["Python Programming Patterns"]}, | |
40 | +{"link": ["http://www.informit.com/store/product.aspx?isbn=0201616165&redir=1"], "title": ["Python Programming with the Java Class Libraries: A Tutorial for Building Web and Enterprise Applications with Jython"]}, | |
41 | +{"link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0201748843,00%2Ben-USS_01DBC.html"], "title": ["Python: Visual QuickStart Guide"]}, | |
42 | +{"link": ["http://www.informit.com/store/product.aspx?isbn=0672317354"], "title": ["Sams Teach Yourself Python in 24 Hours"]}, | |
43 | +{"link": ["http://gnosis.cx/TPiP/"], "title": ["Text Processing in Python"]}, | |
44 | +{"link": ["http://www.informit.com/store/product.aspx?isbn=0130211192"], "title": ["XML Processing with Python"]}] | |
0 | 45 | \ No newline at end of file | ... | ... |
No preview for this file type
... | ... | @@ -0,0 +1,19 @@ |
# -*- coding: utf-8 -*-
"""Item definitions for the mspider project.

See http://doc.scrapy.org/en/latest/topics/items.html
"""

import scrapy


class MspiderItem(scrapy.Item):
    """Default project item; no fields are defined yet."""
    pass


class DmozItem(scrapy.Item):
    """One directory entry scraped from dmoz.org."""
    title = scrapy.Field()  # anchor text of the entry
    link = scrapy.Field()   # href of the entry's anchor
    desc = scrapy.Field()   # description text (not populated by DmozSpider.parse)
No preview for this file type
... | ... | @@ -0,0 +1,11 @@ |
# -*- coding: utf-8 -*-
"""Item pipeline for the mspider project.

Remember to register it in the ITEM_PIPELINES setting; see
http://doc.scrapy.org/en/latest/topics/item-pipeline.html
"""


class MspiderPipeline(object):
    """Pass-through pipeline: forwards every item unchanged."""

    def process_item(self, item, spider):
        # No cleaning or filtering yet; returning the item keeps it in the feed.
        return item
... | ... | @@ -0,0 +1,17 @@ |
# -*- coding: utf-8 -*-
"""Scrapy settings for the mspider project.

Only the essential settings are configured here; the full list of
options is documented at
http://doc.scrapy.org/en/latest/topics/settings.html
"""

# Name of the bot implemented by this project.
BOT_NAME = 'mspider'

# Where Scrapy discovers spiders, and where "scrapy genspider" creates new ones.
SPIDER_MODULES = ['mspider.spiders']
NEWSPIDER_MODULE = 'mspider.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'mspider (+http://www.yourdomain.com)'
No preview for this file type
No preview for this file type
... | ... | @@ -0,0 +1,27 @@ |
__author__ = 'chunk'

import scrapy
from ..items import DmozItem


class DmozSpider(scrapy.Spider):
    """Scrape titles and links from two dmoz.org Python category pages."""

    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    def parse(self, response):
        """Default Scrapy callback for downloaded responses.

        Yields one DmozItem per <li> element found under a <ul>, with the
        anchor text stored as 'title' and the anchor href as 'link'.

        Ref - http://doc.scrapy.org/en/0.24/topics/spiders.html
        """
        for entry in response.xpath('//ul/li'):
            scraped = DmozItem()
            scraped['title'] = entry.xpath('a/text()').extract()
            scraped['link'] = entry.xpath('a/@href').extract()
            yield scraped
No preview for this file type
... | ... | @@ -0,0 +1,11 @@ |
1 | +# Automatically created by: scrapy startproject | |
2 | +# | |
3 | +# For more information about the [deploy] section see: | |
4 | +# http://doc.scrapy.org/en/latest/topics/scrapyd.html | |
5 | + | |
6 | +[settings] | |
7 | +default = mspider.settings | |
8 | + | |
9 | +[deploy] | |
10 | +#url = http://localhost:6800/ | |
11 | +project = mspider | ... | ... |