From bde8352b4ba5a61935b4d7d0399c063df76951bb Mon Sep 17 00:00:00 2001
From: Chunk <chunkplus@gmail.com>
Date: Sun, 10 May 2015 12:25:32 +0800
Subject: [PATCH] Add optional shuffling to load_data; let write_lmdb accept (data, label) pairs.

---
 mdata/ILSVRC.py        | 18 ++++++++++++------
 mmodel/caffe/helper.py | 40 +++++++++++++++++++++++++++-------------
 test/test_data.py      | 10 ++++------
 3 files changed, 43 insertions(+), 25 deletions(-)

diff --git a/mdata/ILSVRC.py b/mdata/ILSVRC.py
index 427b37d..8dd8526 100644
--- a/mdata/ILSVRC.py
+++ b/mdata/ILSVRC.py
@@ -299,10 +299,10 @@ class DataILSVRC(DataDumperBase):
                     # if w < 300 or h < 300:
                     # continue
                     # left, upper = random.randint(0, w - 300), random.randint(0, h - 300)
-                    #     img_crop = img[upper:upper + 300, left:left + 300]
-                    #     cv2.imwrite(os.path.join(base_dir, category + '_crop_cv', name), img_crop)
+                    # img_crop = img[upper:upper + 300, left:left + 300]
+                    # cv2.imwrite(os.path.join(base_dir, category + '_crop_cv', name), img_crop)
                     # except Exception as e:
-                    #     print '[EXCPT]', e
+                    # print '[EXCPT]', e
                     #     pass
 
 
@@ -439,7 +439,7 @@ class DataILSVRC(DataDumperBase):
             pass
 
 
-    def load_data(self, mode='local', feattype='ibd', tagtype='class'):
+    def load_data(self, mode='local', feattype='ibd', tagtype='class', shuffle=False):
         INDEX = []
         X = []
         Y = []
@@ -461,7 +461,8 @@ class DataILSVRC(DataDumperBase):
 
                 for tag, feat in dict_dataset.values():
                     feat.ravel()[[i * 200 + j for i in range(0, 200, 8) for j in range(0, 200, 8)]] = 0
-                    # feat = np.bitwise_and(feat, 1)
+                    feat = np.absolute(feat)
+                    feat = np.bitwise_and(feat, 1)
                     X.append(feat.ravel())
                     Y.append(int(tag))
 
@@ -503,8 +504,13 @@ class DataILSVRC(DataDumperBase):
         else:
             raise Exception("Unknown mode!")
 
-        return X, Y
+        if shuffle:
+            # shuffling
+            Z = zip(X, Y)
+            np.random.shuffle(Z)
+            return Z
 
+        return X, Y
 
 
 
diff --git a/mmodel/caffe/helper.py b/mmodel/caffe/helper.py
index 9b9b368..72083e1 100644
--- a/mmodel/caffe/helper.py
+++ b/mmodel/caffe/helper.py
@@ -61,23 +61,37 @@ def _write_lmdb_raw(X, Y, lmdb_name_data='../res/data_lmdb', lmdb_name_label='..
         in_db_data.close()
 
 
-def write_lmdb(X, Y, lmdb_name_data='../res/data_lmdb', lmdb_name_label='../res/label_lmdb'):
+def write_lmdb(X, Y=None, lmdb_name_data='../res/data_lmdb', lmdb_name_label='../res/label_lmdb'):
     """
     X - numpy array of data.
     Y - numpy array of labels.
     """
-    print('writing image data...')
-    for idx in range(int(math.ceil(len(Y) / 1000.0))):
-        in_db_data = lmdb.open(lmdb_name_data, map_size=int(1e12))
-        with in_db_data.begin(write=True) as in_txn:
-            for in_idx, (in_, label_) in enumerate(
-                    zip(X[(1000 * idx):(1000 * (idx + 1))], Y[(1000 * idx):(1000 * (idx + 1))])):
-                # im = caffe.io.load_image(in_)
-                im_dat = caffe.io.array_to_datum(np.array(in_, dtype=int).reshape(1, 200, 200), label_)
-                in_txn.put('{:0>10d}'.format(1000 * idx + in_idx), im_dat.SerializeToString())
-
-                print str(1000 * idx + in_idx + 1) + ' / ' + str(len(X))
-        in_db_data.close()
+    if Y != None:
+        print('writing image data...')
+        for idx in range(int(math.ceil(len(Y) / 1000.0))):
+            in_db_data = lmdb.open(lmdb_name_data, map_size=int(1e12))
+            with in_db_data.begin(write=True) as in_txn:
+                for in_idx, (in_, label_) in enumerate(
+                        zip(X[(1000 * idx):(1000 * (idx + 1))], Y[(1000 * idx):(1000 * (idx + 1))])):
+                    # im = caffe.io.load_image(in_)
+                    im_dat = caffe.io.array_to_datum(np.array(in_, dtype=int).reshape(1, 200, 200), label_)
+                    in_txn.put('{:0>10d}'.format(1000 * idx + in_idx), im_dat.SerializeToString())
+
+                    print str(1000 * idx + in_idx + 1) + ' / ' + str(len(X))
+            in_db_data.close()
+    else:
+        assert isinstance(X[0], tuple)
+        print('writing image data...')
+        for idx in range(int(math.ceil(len(X) / 1000.0))):
+            in_db_data = lmdb.open(lmdb_name_data, map_size=int(1e12))
+            with in_db_data.begin(write=True) as in_txn:
+                for in_idx, (in_, label_) in enumerate(X[(1000 * idx):(1000 * (idx + 1))]):
+                    # im = caffe.io.load_image(in_)
+                    im_dat = caffe.io.array_to_datum(np.array(in_, dtype=int).reshape(1, 200, 200), label_)
+                    in_txn.put('{:0>10d}'.format(1000 * idx + in_idx), im_dat.SerializeToString())
+
+                    print str(1000 * idx + in_idx + 1) + ' / ' + str(len(X))
+            in_db_data.close()
 
 
 if __name__ == '__main__':
diff --git a/test/test_data.py b/test/test_data.py
index 612ad85..d52258f 100755
--- a/test/test_data.py
+++ b/test/test_data.py
@@ -6,6 +6,7 @@ from ..mdata import MSR, CV, ILSVRC, ILSVRC_S, crop
 
 from ..mmodel.caffe.helper import *
 
+
 def test_MSR():
     dmsr = MSR.DataMSR()
     # msrd.format()
@@ -164,14 +165,11 @@ def test_caffe():
     # return
 
     dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_5000_crop_pil')
-    X, Y = dil.load_data(mode='local', feattype='coef')
+    X = dil.load_data(mode='local', feattype='coef', shuffle=True)
     print X[0]
-    print Y
-    print np.array(X).shape, np.array(Y).shape
-
-    write_lmdb(X[2000:3000],Y[2000:3000])
-
+    print np.array(X).shape
 
+    write_lmdb(X[7000:])
 
 
 if __name__ == '__main__':
--
libgit2 0.21.2