diff --git a/mmodel/theano/THEANO.py b/mmodel/theano/THEANO.py
index 7074ba1..9b7006a 100644
--- a/mmodel/theano/THEANO.py
+++ b/mmodel/theano/THEANO.py
@@ -37,38 +37,223 @@ class ModelTHEANO(ModelBase):
         self.sparker = sc
         self.model = None
 
-    def _shared_dataset(self, data_xy, borrow=True):
-        """ Function that loads the dataset into shared variables
-
-        The reason we store our dataset in shared variables is to allow
-        Theano to copy it into the GPU memory (when code is run on GPU).
-        Since copying data into the GPU is slow, copying a minibatch everytime
-        is needed (the default behaviour if the data is not in a shared
-        variable) would lead to a large decrease in performance.
-        """
-        data_x, data_y = data_xy
-        shared_x = theano.shared(np.asarray(data_x,
-                                            dtype=theano.config.floatX),
-                                 borrow=borrow)
-        shared_y = theano.shared(np.asarray(data_y,
-                                            dtype=theano.config.floatX),
-                                 borrow=borrow)
-        # When storing data on the GPU it has to be stored as floats
-        # therefore we will store the labels as ``floatX`` as well
-        # (``shared_y`` does exactly that). But during our computations
-        # we need them as ints (we use labels as index, and if they are
-        # floats it doesn't make sense) therefore instead of returning
-        # ``shared_y`` we will have to cast it to int. This little hack
-        # lets ous get around this issue
-        return shared_x, T.cast(shared_y, 'int32')
-
-    def _train_cnn(self, X=None, Y=None, dataset=os.path.join(package_dir, '../../res/', 'ils_crop.pkl'),
+    def _train_cnn(self, X=None, Y=None, dataset=os.path.join(package_dir, '../../res/', 'mnist.pkl.gz'),
                    learning_rate=0.1, n_epochs=200,
                    nkerns=[20, 50, 50],
                    batch_size=400):
 
-        return train_cnn_example(X, Y, dataset=dataset, learning_rate=learning_rate, n_epochs=n_epochs, nkerns=nkerns,
-                                 batch_size=batch_size)
+        # return train_cnn_example(X, Y, dataset=dataset, learning_rate=learning_rate, n_epochs=n_epochs, nkerns=nkerns,
+        #                          batch_size=batch_size)
+
+        with gzip.open(dataset, 'rb') as f:
+            train_set, valid_set, test_set = cPickle.load(f)
+
+        train_set_x, train_set_y = shared_dataset(train_set)
+        valid_set_x, valid_set_y = shared_dataset(valid_set)
+        test_set_x, test_set_y = shared_dataset(test_set)
+
+        # compute number of minibatches for training, validation and testing
+        n_train_batches = train_set_x.get_value(borrow=True).shape[0]
+        n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
+        n_test_batches = test_set_x.get_value(borrow=True).shape[0]
+        n_train_batches /= batch_size
+        n_valid_batches /= batch_size
+        n_test_batches /= batch_size
+
+        print train_set_x.get_value(borrow=True).shape, train_set_y.get_value(borrow=True).shape
+
+        rng = np.random.RandomState(12306)
+        index = T.lscalar()  # index to a [mini]batch
+        # start-snippet-1
+        x = T.matrix('x')   # the data is presented as rasterized images
+        y = T.ivector('y')  # the labels are presented as 1D vector of
+                            # [int] labels
+
+        ######################
+        # BUILD ACTUAL MODEL #
+        ######################
+        print '... building the model'
+
+        layer0_input = x.reshape((batch_size, 1, 28, 28))
+
+        # Construct the first convolutional pooling layer:
+        # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24)
+        # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
+        # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
+        layer0 = ConvPoolLayer(
+            rng,
+            input=layer0_input,
+            image_shape=(batch_size, 1, 28, 28),
+            filter_shape=(nkerns[0], 1, 5, 5),
+            poolsize=(2, 2)
+        )
+
+        # Construct the second convolutional pooling layer
+        # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
+        # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
+        # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
+        layer1 = ConvPoolLayer(
+            rng,
+            input=layer0.output,
+            image_shape=(batch_size, nkerns[0], 12, 12),
+            filter_shape=(nkerns[1], nkerns[0], 5, 5),
+            poolsize=(2, 2)
+        )
+
+        # the HiddenLayer being fully-connected, it operates on 2D matrices of
+        # shape (batch_size, num_pixels) (i.e matrix of rasterized images).
+        # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4),
+        # or (400, 50 * 4 * 4) = (400, 800) with the default values.
+        layer2_input = layer1.output.flatten(2)
+
+        # construct a fully-connected hidden layer with tanh activation
+        layer2 = HiddenLayer(
+            rng,
+            input=layer2_input,
+            n_in=nkerns[1] * 4 * 4,
+            n_out=500,
+            activation=T.tanh
+        )
+
+        # classify the outputs of the fully-connected tanh hidden layer
+        layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10)
+
+        # the cost we minimize during training is the NLL of the model
+        cost = layer3.negative_log_likelihood(y)
+
+        # create a function to compute the mistakes that are made by the model
+        test_model = theano.function(
+            [index],
+            layer3.errors(y),
+            givens={
+                x: test_set_x[index * batch_size: (index + 1) * batch_size],
+                y: test_set_y[index * batch_size: (index + 1) * batch_size]
+            }
+        )
+
+        validate_model = theano.function(
+            [index],
+            layer3.errors(y),
+            givens={
+                x: valid_set_x[index * batch_size: (index + 1) * batch_size],
+                y: valid_set_y[index * batch_size: (index + 1) * batch_size]
+            }
+        )
+
+        # create a list of all model parameters to be fit by gradient descent
+        params = layer3.params + layer2.params + layer1.params + layer0.params
+
+        # create a list of gradients for all model parameters
+        grads = T.grad(cost, params)
+
+        # train_model is a function that updates the model parameters by
+        # SGD. Since this model has many parameters, it would be tedious to
+        # manually create an update rule for each model parameter. We thus
+        # create the updates list by automatically looping over all
+        # (params[i], grads[i]) pairs.
+        updates = [
+            (param_i, param_i - learning_rate * grad_i)
+            for param_i, grad_i in zip(params, grads)
+        ]
+
+        train_model = theano.function(
+            [index],
+            cost,
+            updates=updates,
+            givens={
+                x: train_set_x[index * batch_size: (index + 1) * batch_size],
+                y: train_set_y[index * batch_size: (index + 1) * batch_size]
+            }
+        )
+        # end-snippet-1
+
+        ###############
+        # TRAIN MODEL #
+        ###############
+        print '... training'
+        # early-stopping parameters
+        patience = 10000  # look at this many minibatches regardless
+        patience_increase = 2  # wait this much longer when a new best is
+                               # found
+        improvement_threshold = 0.995  # a relative improvement of this much is
+                                       # considered significant
+        validation_frequency = min(n_train_batches, patience / 2)
+                                      # go through this many
+                                      # minibatches before checking the network
+                                      # on the validation set; in this case we
+                                      # check every epoch
+
+        best_validation_loss = np.inf
+        best_iter = 0
+        test_score = 0.
+        start_time = time.clock()
+
+        epoch = 0
+        done_looping = False
+
+        while (epoch < n_epochs) and (not done_looping):
+            epoch = epoch + 1
+            for minibatch_index in xrange(n_train_batches):
+
+                iter = (epoch - 1) * n_train_batches + minibatch_index
+
+                if iter % 100 == 0:
+                    print 'training @ iter = ', iter
+                cost_ij = train_model(minibatch_index)
+
+                if (iter + 1) % validation_frequency == 0:
+
+                    # compute zero-one loss on validation set
+                    validation_losses = [validate_model(i) for i
+                                         in xrange(n_valid_batches)]
+                    this_validation_loss = np.mean(validation_losses)
+                    print('epoch %i, minibatch %i/%i, validation error %f %%' %
+                          (epoch, minibatch_index + 1, n_train_batches,
+                           this_validation_loss * 100.))
+
+                    # if we got the best validation score until now
+                    if this_validation_loss < best_validation_loss:
+
+                        #improve patience if loss improvement is good enough
+                        if this_validation_loss < best_validation_loss *  \
+                           improvement_threshold:
+                            patience = max(patience, iter * patience_increase)
+
+                        # save best validation score and iteration number
+                        best_validation_loss = this_validation_loss
+                        best_iter = iter
+
+                        # test it on the test set
+                        test_losses = [
+                            test_model(i)
+                            for i in xrange(n_test_batches)
+                        ]
+                        test_score = np.mean(test_losses)
+                        print(('     epoch %i, minibatch %i/%i, test error of '
+                               'best model %f %%') %
+                              (epoch, minibatch_index + 1, n_train_batches,
+                               test_score * 100.))
+
+                if patience <= iter:
+                    done_looping = True
+                    break
+
+        end_time = time.clock()
+        print('Optimization complete.')
+        print('Best validation score of %f %% obtained at iteration %i, '
+              'with test performance %f %%' %
+              (best_validation_loss * 100., best_iter + 1, test_score * 100.))
+        print >> sys.stderr, ('The code for file ' +
+                              os.path.split(__file__)[1] +
+                              ' ran for %.2fm' % ((end_time - start_time) / 60.))
+
+
+
+
+
+
+
+
 
 
     def train(self, X, Y):
diff --git a/mmodel/theano/theanoutil.py b/mmodel/theano/theanoutil.py
index 8341aac..17ed51e 100644
--- a/mmodel/theano/theanoutil.py
+++ b/mmodel/theano/theanoutil.py
@@ -168,7 +168,7 @@ class ConvPoolLayer(object):
         self.output = T.tanh(pooled_out + self.b.dimshuffle('x', 0, 'x', 'x'))
         self.params = [self.W, self.b]
 
-def _shared_dataset(data_xy, borrow=True):
+def shared_dataset(data_xy, borrow=True):
     """ Function that loads the dataset into shared variables
 
     The reason we store our dataset in shared variables is to allow
@@ -208,8 +208,8 @@ def train_cnn_example(X=None, Y=None, dataset=os.path.join('', '../../res/', 'il
     else:
         X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y, test_size=0.2, random_state=0)
 
-    X_train, Y_train = _shared_dataset((X_train, Y_train))
-    X_test, Y_test = _shared_dataset((X_test, Y_test))
+    X_train, Y_train = shared_dataset((X_train, Y_train))
+    X_test, Y_test = shared_dataset((X_test, Y_test))
 
     # X_train = theano.shared(np.asarray(X_train, dtype=theano.config.floatX), borrow=True)
     # Y_train = theano.shared(np.asarray(Y_train, dtype=theano.config.floatX), borrow=True)
diff --git a/test/test_model.py b/test/test_model.py
index afb7bea..1d633a2 100755
--- a/test/test_model.py
+++ b/test/test_model.py
@@ -149,6 +149,11 @@ def test_SVM_ILSVRC_S():
     # test_SVM_ILSVRC_SPARK()
 
 
+def test_THEANO_mnist():
+    mtheano = THEANO.ModelTHEANO(toolset='cnn')
+    mtheano._train_cnn(learning_rate=0.1, n_epochs=200, dataset=os.path.join(package_dir, '../res/', 'mnist.pkl.gz'), nkerns=[20, 50], batch_size=500)
+
+
 def test_THEANO_crop():
     timer.mark()
     dilc = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Test_crop_pil')
--
libgit2 0.21.2