Commit 4737e1662edbd91e50699151b52004c479a429e1

Authored by Chunk
1 parent 9a5cac33

staged.

.idea/ImageR.iml
... ... @@ -2,7 +2,7 @@
2 2 <module type="PYTHON_MODULE" version="4">
3 3 <component name="NewModuleRootManager">
4 4 <content url="file://$MODULE_DIR$" />
5   - <orderEntry type="jdk" jdkName="Python 2.7.8 virtualenv at ~/.virtualenvs/env1" jdkType="Python SDK" />
  5 + <orderEntry type="jdk" jdkName="Python 2.7.6 virtualenv at ~/.virtualenvs/env0" jdkType="Python SDK" />
6 6 <orderEntry type="sourceFolder" forTests="false" />
7 7 </component>
8 8 </module>
9 9 \ No newline at end of file
... ...
... ... @@ -1,63 +0,0 @@
1   -#!/bin/zsh
2   -# chunk @ 2014
3   -########################################################################################
4   -##
5   -## F**k World!
6   -##
7   -##
8   -## export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:`hbase classpath`
9   -## export SPARK_CLASSPATH=$SPARK_CLASSPATH:`hbase classpath`
10   -##
11   -## spark-submit \
12   -## --driver-memory 1g \
13   -## --executor-memory 1g \
14   -## --executor-cores 2 \
15   -## --deploy-mode client \
16   -## --master yarn \
17   -## --class "FuckWorld" \
18   -## $APP_JAR $ARGS
19   -##
20   -## spark-class org.apache.spark.deploy.yarn.Client \
21   -## --num-executors 2 \
22   -## --executor-cores 2 \
23   -## --driver-memory 1g \
24   -## --executor-memory 1g \
25   -## --name "F**k World" \
26   -## --jar $APP_JAR \
27   -## --class "FuckWorld" \
28   -## --args $ARGS
29   -##
30   -##spark-submit \
31   -## --driver-memory 1g \
32   -## --executor-memory 1g \
33   -## --executor-cores 2 \
34   -## --master spark://HPC-server:7077 \
35   -## --jars $SPARK_HOME/lib/spark-examples-1.2.0-hadoop2.5.1.jar \
36   -## $APP_JAR $ARGS
37   -########################################################################################
38   -
39   -source /home/hadoop/.zshrc
40   -v env1
41   -
42   -export PYSPARK_PYTHON=/home/hadoop/.virtualenvs/env1/bin/python
43   -export SPARK_CLASSPATH=`hbase classpath`
44   -export SPARK_JAR=hdfs://HPC-server:9000/user/spark/share/lib/spark-assembly-1.2.0-hadoop2.5.1.jar
45   -
46   -#COMPRESSED=/home/hadoop/workspace/pycharm/test/ImageR.zip
47   -# --py-files $COMPRESSED \
48   -COMPRESSED=/home/hadoop/workspace/pycharm/tmp/ImageR/mdata.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mfeat.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mjpeg.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/msteg.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mmodel.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mspark.zip
49   -
50   -APP=test_spark.py
51   -#APP=test_model.py
52   -ARGS=
53   -
54   -spark-submit \
55   - --driver-memory 1g \
56   - --executor-memory 2g \
57   - --executor-cores 2 \
58   - --master spark://HPC-server:7077 \
59   - --jars $SPARK_HOME/lib/spark-examples-1.2.0-hadoop2.5.1.jar \
60   - --py-files $COMPRESSED \
61   - $APP $ARGS
62   -
63   -
run_spark.sh 0 → 100755
... ... @@ -0,0 +1,63 @@
  1 +#!/bin/zsh
  2 +# chunk @ 2014
  3 +########################################################################################
  4 +##
  5 +## F**k World!
  6 +##
  7 +##
  8 +## export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:`hbase classpath`
  9 +## export SPARK_CLASSPATH=$SPARK_CLASSPATH:`hbase classpath`
  10 +##
  11 +## spark-submit \
  12 +## --driver-memory 1g \
  13 +## --executor-memory 1g \
  14 +## --executor-cores 2 \
  15 +## --deploy-mode client \
  16 +## --master yarn \
  17 +## --class "FuckWorld" \
  18 +## $APP_JAR $ARGS
  19 +##
  20 +## spark-class org.apache.spark.deploy.yarn.Client \
  21 +## --num-executors 2 \
  22 +## --executor-cores 2 \
  23 +## --driver-memory 1g \
  24 +## --executor-memory 1g \
  25 +## --name "F**k World" \
  26 +## --jar $APP_JAR \
  27 +## --class "FuckWorld" \
  28 +## --args $ARGS
  29 +##
  30 +##spark-submit \
  31 +## --driver-memory 1g \
  32 +## --executor-memory 1g \
  33 +## --executor-cores 2 \
  34 +## --master spark://HPC-server:7077 \
  35 +## --jars $SPARK_HOME/lib/spark-examples-1.2.0-hadoop2.5.1.jar \
  36 +## $APP_JAR $ARGS
  37 +########################################################################################
  38 +
  39 +source /home/hadoop/.zshrc
  40 +v env1
  41 +
  42 +export PYSPARK_PYTHON=/home/hadoop/.virtualenvs/env1/bin/python
  43 +export SPARK_CLASSPATH=`hbase classpath`
  44 +export SPARK_JAR=hdfs://HPC-server:9000/user/spark/share/lib/spark-assembly-1.2.0-hadoop2.5.1.jar
  45 +
  46 +#COMPRESSED=/home/hadoop/workspace/pycharm/test/ImageR.zip
  47 +# --py-files $COMPRESSED \
  48 +COMPRESSED=/home/hadoop/workspace/pycharm/tmp/ImageR/mdata.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mfeat.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mjpeg.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/msteg.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mmodel.zip,/home/hadoop/workspace/pycharm/tmp/ImageR/mspark.zip
  49 +
  50 +APP=test_spark.py
  51 +#APP=test_model.py
  52 +ARGS=
  53 +
  54 +spark-submit \
  55 + --driver-memory 1g \
  56 + --executor-memory 2g \
  57 + --executor-cores 2 \
  58 + --master spark://HPC-server:7077 \
  59 + --jars $SPARK_HOME/lib/spark-examples-1.2.0-hadoop2.5.1.jar \
  60 + --py-files $COMPRESSED \
  61 + $APP $ARGS
  62 +
  63 +
... ...
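run_spark.sh points PYSPARK_PYTHON at the env1 virtualenv, puts the HBase classpath on SPARK_CLASSPATH, and ships the project's module zips to the executors via --py-files. The driver it submits, test_spark.py, is not part of this commit, so the following is only a minimal sketch of such a driver; the app name and the idea of importing the shipped modules (e.g. mspark) on the executors are assumptions.

    # test_spark.py -- hypothetical sketch, not included in this commit.
    # Anything listed in --py-files (mdata.zip, mspark.zip, ...) is distributed to the
    # executors and becomes importable there, so worker functions can use those modules.
    from pyspark import SparkConf, SparkContext

    def main():
        conf = SparkConf().setAppName("ImageR-test")  # assumed app name
        sc = SparkContext(conf=conf)
        # Trivial sanity job; a real run would import the shipped modules
        # (e.g. `import mspark`) inside functions executed on the workers.
        rdd = sc.parallelize(range(1000), 4)
        print rdd.map(lambda x: x * x).sum()
        sc.stop()

    if __name__ == '__main__':
        main()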
run_spider.sh 0 → 100755
... ... @@ -0,0 +1,23 @@
  1 +#!/bin/zsh
  2 +# chunk @ 2014
  3 +
  4 +####################################################################
  5 +## environment variables
  6 +####################################################################
  7 +export TERM=xterm
  8 +source /home/hadoop/.zshrc
  9 +v env0
  10 +
  11 +####################################################################
  12 +## additional files list
  13 +####################################################################
  14 +FILE=hehe.json
  15 +
  16 +#scrapy runspider spider/test.py
  17 +cd ./spider/mspider/
  18 +[ -f $FILE ] && rm $FILE
  19 +scrapy crawl dmoz -o $FILE
  20 +
  21 +
  22 +
  23 +
... ...
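run_spider.sh activates the env0 virtualenv, deletes any stale hehe.json, and re-runs the dmoz spider with scrapy crawl, writing items to JSON via -o. On newer Scrapy releases (1.0+) the same crawl can also be driven from a Python script; a hedged sketch of that alternative (not part of this commit, and not the 0.24-era API the project otherwise targets):

    # run_dmoz.py -- hypothetical alternative to `scrapy crawl dmoz -o hehe.json`,
    # run from spider/mspider/ so the project settings are found.
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()
    settings.set('FEED_URI', 'hehe.json')    # equivalent of -o hehe.json
    settings.set('FEED_FORMAT', 'json')

    process = CrawlerProcess(settings)
    process.crawl('dmoz')   # spider name registered by DmozSpider
    process.start()         # blocks until the crawl finishes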
spider/__init__.py
1   -__author__ = 'hadoop'
  1 +__author__ = 'chunk'
... ...
spider/mspider/hehe.json 0 → 100644
... ... @@ -0,0 +1,44 @@
  1 +[{"link": ["/"], "title": ["Top"]},
  2 +{"link": ["/Computers/"], "title": ["Computers"]},
  3 +{"link": ["/Computers/Programming/"], "title": ["Programming"]},
  4 +{"link": ["/Computers/Programming/Languages/"], "title": ["Languages"]},
  5 +{"link": ["/Computers/Programming/Languages/Python/"], "title": ["Python"]},
  6 +{"link": [], "title": []},
  7 +{"link": ["/Computers/Programming/Resources/"], "title": ["Computers: Programming: Resources"]},
  8 +{"link": ["http://www.pythonware.com/daily/"], "title": ["eff-bot's Daily Python URL"]},
  9 +{"link": ["http://www.oinko.net/freepython/"], "title": ["Free Python and Zope Hosting Directory"]},
  10 +{"link": ["http://oreilly.com/python/"], "title": ["O'Reilly Python Center"]},
  11 +{"link": ["https://www.python.org/dev/"], "title": ["Python Developer's Guide"]},
  12 +{"link": ["http://win32com.goermezer.de/"], "title": ["Social Bug"]},
  13 +{"link": ["/"], "title": ["Top"]},
  14 +{"link": ["/Computers/"], "title": ["Computers"]},
  15 +{"link": ["/Computers/Programming/"], "title": ["Programming"]},
  16 +{"link": ["/Computers/Programming/Languages/"], "title": ["Languages"]},
  17 +{"link": ["/Computers/Programming/Languages/Python/"], "title": ["Python"]},
  18 +{"link": [], "title": []},
  19 +{"link": ["/Computers/Programming/Languages/Python/Resources/"], "title": ["Computers: Programming: Languages: Python: Resources"]},
  20 +{"link": ["/Computers/Programming/Languages/Ruby/Books/"], "title": ["Computers: Programming: Languages: Ruby: Books"]},
  21 +{"link": ["/World/Deutsch/Computer/Programmieren/Sprachen/Python/B%C3%BCcher"], "title": ["Deutsch"]},
  22 +{"link": ["/World/Russian/%D0%9A%D0%BE%D0%BC%D0%BF%D1%8C%D1%8E%D1%82%D0%B5%D1%80%D1%8B/%D0%9F%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D0%BC%D0%B8%D1%80%D0%BE%D0%B2%D0%B0%D0%BD%D0%B8%D0%B5/%D0%AF%D0%B7%D1%8B%D0%BA%D0%B8/Python/%D0%9A%D0%BD%D0%B8%D0%B3%D0%B8"], "title": ["\u0420\u0443\u0441\u0441\u043a\u0438\u0439"]},
  23 +{"link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0130260363,00%2Ben-USS_01DBC.html"], "title": ["Core Python Programming"]},
  24 +{"link": ["http://www.brpreiss.com/books/opus7/html/book.html"], "title": ["Data Structures and Algorithms with Object-Oriented Design Patterns in Python"]},
  25 +{"link": ["http://www.diveintopython.net/"], "title": ["Dive Into Python 3"]},
  26 +{"link": ["http://rhodesmill.org/brandon/2011/foundations-of-python-network-programming/"], "title": ["Foundations of Python Network Programming"]},
  27 +{"link": ["http://www.techbooksforfree.com/perlpython.shtml"], "title": ["Free Python books"]},
  28 +{"link": ["http://www.freetechbooks.com/python-f6.html"], "title": ["FreeTechBooks: Python Scripting Language"]},
  29 +{"link": ["http://greenteapress.com/thinkpython/"], "title": ["How to Think Like a Computer Scientist: Learning with Python"]},
  30 +{"link": ["http://www.network-theory.co.uk/python/intro/"], "title": ["An Introduction to Python"]},
  31 +{"link": ["http://www.freenetpages.co.uk/hp/alan.gauld/"], "title": ["Learn to Program Using Python"]},
  32 +{"link": ["http://www.wiley.com/WileyCDA/WileyTitle/productCd-0471219754.html"], "title": ["Making Use of Python"]},
  33 +{"link": ["http://hetland.org/writing/practical-python/"], "title": ["Practical Python"]},
  34 +{"link": ["http://sysadminpy.com/"], "title": ["Pro Python System Administration"]},
  35 +{"link": ["http://www.qtrac.eu/py3book.html"], "title": ["Programming in Python 3 (Second Edition)"]},
  36 +{"link": ["http://www.wiley.com/WileyCDA/WileyTitle/productCd-0764548077.html"], "title": ["Python 2.1 Bible"]},
  37 +{"link": ["https://www.packtpub.com/python-3-object-oriented-programming/book"], "title": ["Python 3 Object Oriented Programming"]},
  38 +{"link": ["http://www.network-theory.co.uk/python/language/"], "title": ["Python Language Reference Manual"]},
  39 +{"link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0130409561,00%2Ben-USS_01DBC.html"], "title": ["Python Programming Patterns"]},
  40 +{"link": ["http://www.informit.com/store/product.aspx?isbn=0201616165&redir=1"], "title": ["Python Programming with the Java Class Libraries: A Tutorial for Building Web and Enterprise Applications with Jython"]},
  41 +{"link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0201748843,00%2Ben-USS_01DBC.html"], "title": ["Python: Visual QuickStart Guide"]},
  42 +{"link": ["http://www.informit.com/store/product.aspx?isbn=0672317354"], "title": ["Sams Teach Yourself Python in 24 Hours"]},
  43 +{"link": ["http://gnosis.cx/TPiP/"], "title": ["Text Processing in Python"]},
  44 +{"link": ["http://www.informit.com/store/product.aspx?isbn=0130211192"], "title": ["XML Processing with Python"]}]
0 45 \ No newline at end of file
... ...
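hehe.json, committed here as a sample of the crawl output, is a flat JSON list of {"link": [...], "title": [...]} records, including a few empty entries. Downstream code can load it with the standard json module; a minimal sketch (the path relative to the repo root is an assumption):

    # Hypothetical consumer of the crawl output; not part of this commit.
    import json

    with open('spider/mspider/hehe.json') as f:
        items = json.load(f)    # list of {"link": [...], "title": [...]} dicts

    # Keep only records that actually captured both a title and a link.
    pairs = [(i['title'][0], i['link'][0]) for i in items if i['title'] and i['link']]
    for title, link in pairs[:5]:
        print title, '->', link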
spider/mspider/mspider/__init__.py 0 → 100644
spider/mspider/mspider/__init__.pyc 0 → 100644
No preview for this file type
spider/mspider/mspider/items.py 0 → 100644
... ... @@ -0,0 +1,19 @@
  1 +# -*- coding: utf-8 -*-
  2 +
  3 +# Define here the models for your scraped items
  4 +#
  5 +# See documentation in:
  6 +# http://doc.scrapy.org/en/latest/topics/items.html
  7 +
  8 +import scrapy
  9 +
  10 +
  11 +class MspiderItem(scrapy.Item):
  12 +    # define the fields for your item here like:
  13 +    # name = scrapy.Field()
  14 +    pass
  15 +
  16 +class DmozItem(scrapy.Item):
  17 +    title = scrapy.Field()
  18 +    link = scrapy.Field()
  19 +    desc = scrapy.Field()
... ...
spider/mspider/mspider/items.pyc 0 → 100644
No preview for this file type
spider/mspider/mspider/pipelines.py 0 → 100644
... ... @@ -0,0 +1,11 @@
  1 +# -*- coding: utf-8 -*-
  2 +
  3 +# Define your item pipelines here
  4 +#
  5 +# Don't forget to add your pipeline to the ITEM_PIPELINES setting
  6 +# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
  7 +
  8 +
  9 +class MspiderPipeline(object):
  10 +    def process_item(self, item, spider):
  11 +        return item
... ...
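As the template comment in pipelines.py notes, MspiderPipeline only runs once it is registered under ITEM_PIPELINES; this commit leaves it unregistered, which is harmless since it just passes items through. A minimal sketch of the registration it would need in settings.py (the priority 300 is an arbitrary example):

    # settings.py -- hypothetical addition, not included in this commit.
    ITEM_PIPELINES = {
        'mspider.pipelines.MspiderPipeline': 300,   # 0-1000, lower runs earlier
    }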
spider/mspider/mspider/settings.py 0 → 100644
... ... @@ -0,0 +1,17 @@
  1 +# -*- coding: utf-8 -*-
  2 +
  3 +# Scrapy settings for mspider project
  4 +#
  5 +# For simplicity, this file contains only the most important settings by
  6 +# default. All the other settings are documented here:
  7 +#
  8 +# http://doc.scrapy.org/en/latest/topics/settings.html
  9 +#
  10 +
  11 +BOT_NAME = 'mspider'
  12 +
  13 +SPIDER_MODULES = ['mspider.spiders']
  14 +NEWSPIDER_MODULE = 'mspider.spiders'
  15 +
  16 +# Crawl responsibly by identifying yourself (and your website) on the user-agent
  17 +#USER_AGENT = 'mspider (+http://www.yourdomain.com)'
... ...
spider/mspider/mspider/settings.pyc 0 → 100644
No preview for this file type
spider/mspider/mspider/spiders/__init__.py 0 → 100644
... ... @@ -0,0 +1,4 @@
  1 +# This package will contain the spiders of your Scrapy project
  2 +#
  3 +# Please refer to the documentation for information on how to create and manage
  4 +# your spiders.
... ...
spider/mspider/mspider/spiders/__init__.pyc 0 → 100644
No preview for this file type
spider/mspider/mspider/spiders/test000.py 0 → 100644
... ... @@ -0,0 +1,27 @@
  1 +__author__ = 'chunk'
  2 +
  3 +import scrapy
  4 +from ..items import DmozItem
  5 +
  6 +class DmozSpider(scrapy.Spider):
  7 + name = "dmoz"
  8 + allowed_domains = ["dmoz.org"]
  9 + start_urls = [
  10 + "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
  11 + "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
  12 + ]
  13 +
  14 + def parse(self, response):
  15 + """
  16 + This is the default callback used by Scrapy to process downloaded responses
  17 + The parse method is in charge of processing the response and returning scraped data and/or more URLs to follow.
  18 + This method, as well as any other Request callback, must return an iterable of Request and/or Item objects.
  19 +
  20 + Ref - http://doc.scrapy.org/en/0.24/topics/spiders.html
  21 + """
  22 + for sel in response.xpath('//ul/li'):
  23 + item = DmozItem()
  24 + item['title'] = sel.xpath('a/text()').extract()
  25 + item['link'] = sel.xpath('a/@href').extract()
  26 +
  27 + yield item
... ...
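The docstring above points out that parse() may return Request objects as well as items; a hedged sketch of how this spider could also follow the links it scrapes (not part of this commit; it reuses the imports already present in test000.py, offsite URLs are filtered by allowed_domains, and duplicate requests by Scrapy's default dupe filter):

    # Hypothetical replacement for DmozSpider.parse; assumes the same imports as
    # test000.py plus urlparse and Request below (Python 2 stdlib / scrapy.http).
    import urlparse
    from scrapy.http import Request

    def parse(self, response):
        for sel in response.xpath('//ul/li'):
            item = DmozItem()
            item['title'] = sel.xpath('a/text()').extract()
            item['link'] = sel.xpath('a/@href').extract()
            yield item
            # Follow each scraped link and parse the target page the same way.
            for href in sel.xpath('a/@href').extract():
                yield Request(urlparse.urljoin(response.url, href),
                              callback=self.parse)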
spider/mspider/mspider/spiders/test000.pyc 0 → 100644
No preview for this file type
spider/mspider/scrapy.cfg 0 → 100644
... ... @@ -0,0 +1,11 @@
  1 +# Automatically created by: scrapy startproject
  2 +#
  3 +# For more information about the [deploy] section see:
  4 +# http://doc.scrapy.org/en/latest/topics/scrapyd.html
  5 +
  6 +[settings]
  7 +default = mspider.settings
  8 +
  9 +[deploy]
  10 +#url = http://localhost:6800/
  11 +project = mspider
... ...
spider/test.py
... ... @@ -1,19 +0,0 @@
1   -__author__ = 'chunk'
2   -
3   -
4   -
5   -
6   -
7   -
8   -
9   -
10   -
11   -
12   -
13   -
14   -
15   -
16   -
17   -
18   -
19   -