From 5426d9db04ff39181aca5bd19a3c98cf35fab3a7 Mon Sep 17 00:00:00 2001 From: Chunk Date: Wed, 8 Jul 2015 11:44:41 +0800 Subject: [PATCH] almost finished. --- mspark/SC.py | 3 ++- test/test_data.py | 10 +++++----- test/test_model.py | 60 ++++-------------------------------------------------------- 3 files changed, 11 insertions(+), 62 deletions(-) diff --git a/mspark/SC.py b/mspark/SC.py index 3c9d42c..654201b 100644 --- a/mspark/SC.py +++ b/mspark/SC.py @@ -4,6 +4,7 @@ __author__ = 'chunk' from ..common import * from .dependencies import * from . import * +import rdd from .rdd import * import sys @@ -110,7 +111,7 @@ class Sparker(object): rdd_data = data rdd_data.flatMap( - lambda x: format_out(x, cols, withdata=withdata)).saveAsNewAPIHadoopDataset( + lambda x: rdd.format_out(x, cols, withdata=withdata)).saveAsNewAPIHadoopDataset( conf=hconf, keyConverter=hparams["writeKeyConverter"], valueConverter=hparams["writeValueConverter"]) diff --git a/test/test_data.py b/test/test_data.py index 516a571..b4d2dd7 100755 --- a/test/test_data.py +++ b/test/test_data.py @@ -47,17 +47,17 @@ def test_ILSVRC(category='Train_100'): timer.report() -def test_ILSVRC_S_LOCAL(): +def test_ILSVRC_S_LOCAL(category='Train_100'): timer = Timer() timer.mark() - dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2') + dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category=category) dil.delete_table() dil.format() dil.store_img() timer.report() - dils = ILSVRC_S.DataILSVRC_S(base='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2') + dils = ILSVRC_S.DataILSVRC_S(base='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category=category) # dils._extract_data(mode='hbase', writeback=True) # dils._embed_data(mode='hbase', rate=0.1, readforward=True, writeback=True) @@ -90,7 +90,7 @@ def test_ILSVRC_S_SPARK(category='Train_100'): timer.report() -def test_ILSVRC_S_ANALYSIS(category='Train_1000', tablename=None): +def test_ILSVRC_S_ANALYSIS(category='Train_100', tablename=None): timer = Timer() timer.mark() @@ -112,7 +112,7 @@ def test_ILSVRC_S_ANALYSIS(category='Train_1000', tablename=None): timer.report() -def test_ILSVRC_S_ANALYSIS2(category='Train_1000', tablename='MSPIDER'): +def test_ILSVRC_S_ANALYSIS2(category='Train_100', tablename='MSPIDER'): timer = Timer() # timer.mark() diff --git a/test/test_model.py b/test/test_model.py index 4ee93e2..94e38b2 100755 --- a/test/test_model.py +++ b/test/test_model.py @@ -3,9 +3,8 @@ __author__ = 'chunk' from sklearn import cross_validation from pyspark.mllib.regression import LabeledPoint from ..common import * -from ..mdata import CV, ILSVRC, ILSVRC_S +from ..mdata import ILSVRC, ILSVRC_S from ..mmodel.svm import SVM -from ..mmodel.theano import THEANO import gzip import cPickle @@ -15,36 +14,6 @@ timer = Timer() package_dir = os.path.dirname(os.path.abspath(__file__)) -def test_SVM_CV(): - timer.mark() - dcv = CV.DataCV() - X, Y = dcv.load_data(mode='local') # 90.468586s -> 5.392520s - # X, Y = dcv.load_data(mode='hbase') # 21.682754s - # X, Y = dcv.load_data(mode='spark') # 29.549597s - timer.report() - - timer.mark() - # msvm = SVM.ModelSVM(toolset='sklearn') # 3.030380s - # msvm = SVM.ModelSVM(toolset='opencv') # 8.939880s - # msvm = SVM.ModelSVM(toolset='libsvm') # 185.524023s - msvm = SVM.ModelSVM(toolset='spark') - - msvm.train(X, Y) - timer.report() - - timer.mark() - for path, subdirs, files in os.walk('data/467/'): - for name in files: - imgpath = os.path.join(path, name) - feat = dcv.get_feat(imgpath, 'hog') - print name, msvm.predict(feat) - timer.report() - - timer.mark() - print msvm.test(X, Y) # 0.948892561983 for svm_cv, 0.989024793388 for svm_sk, 0.9900826446280992 for svm_lib - timer.report() # 27.421949s for svm_lib - - def test_SVM_ILSVRC(): timer.mark() dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_5000_0.05_orig') @@ -119,6 +88,7 @@ def test_SVM_ILSVRC_HBASE(): # print scores # timer.report() + def test_SVM_ILSVRC_TEST(): timer.mark() @@ -131,7 +101,7 @@ def test_SVM_ILSVRC_TEST(): timer.report() timer.mark() - print msvm.test(X1, Y1) #(0.048868415782094936, 0.4924709948160948, 0.74568774878372401) + print msvm.test(X1, Y1) # (0.048868415782094936, 0.4924709948160948, 0.74568774878372401) timer.report() # # timer.mark() # print 'or like this:' @@ -145,7 +115,7 @@ def test_SVM_ILSVRC_SPARK(): dils = ILSVRC_S.DataILSVRC_S(base='ILSVRC2013_DET_val', category='Train_5000') # rdd_dataset = dils.load_data(mode='spark') # pass X, Y = dils.load_data(mode='hbase') # pass - rdd_dataset = dils.sparker.sc.parallelize(zip(Y,X), 30).map(lambda x: LabeledPoint(x[0], x[1])) + rdd_dataset = dils.sparker.sc.parallelize(zip(Y, X), 30).map(lambda x: LabeledPoint(x[0], x[1])) timer.report() timer.mark() @@ -171,28 +141,6 @@ def test_SVM_ILSVRC_S(): # test_SVM_ILSVRC_SPARK() -def test_THEANO_mnist(): - mtheano = THEANO.ModelTHEANO(toolset='cnn') - mtheano._train_cnn(learning_rate=0.1, n_epochs=200, dataset=os.path.join(package_dir, '../res/', 'mnist.pkl.gz'), nkerns=[20, 50], batch_size=500) - - -def test_THEANO_crop(): - timer.mark() - dilc = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_5000_crop_pil') - X, Y = dilc.load_data(mode='local', feattype='coef') - print X[0],Y - timer.report() - - # X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y, test_size=0.2, random_state=0) - # with open(os.path.join(package_dir,'../res/','ils_crop.pkl'),'wb') as f: - # cPickle.dump([(X_train,Y_train),(X_test,Y_test)], f) - - timer.mark() - mtheano = THEANO.ModelTHEANO(toolset='cnn') - mtheano._train_cnn(X, Y) - timer.report() - - if __name__ == '__main__': # test_SVM_CV() test_SVM_ILSVRC() -- libgit2 0.21.2