Commit 28e7b96

added benchmark scripts

1 parent 7d38b35
6 files changed: +83 additions, -39 deletions

scripts/benchmarks/skkmeans.py

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+import time
+
+import numpy as np
+import scipy.io as sio
+import h5py
+from sklearn.datasets import load_svmlight_file
+from sklearn.cluster import KMeans
+
+f = h5py.File('/code/BIDMach/data/MNIST8M/all.mat', 'r')
+
+t0 = time.time()
+data = f.get('/all')  # the 'all' dataset inside the HDF5 .mat file
+X = np.array(data)    # materialize it as a dense NumPy array
+t1 = time.time()
+
+t_read = t1 - t0
+print("Finished reading in " + repr(t_read) + " secs")
+
+batch_size = 10  # unused in this script; see the mini-batch sketch below
+kmeans = KMeans(n_clusters=256, init='random', n_init=1, max_iter=10,
+                tol=0.0001, precompute_distances=False, verbose=0,
+                random_state=None, copy_x=False, n_jobs=1)
+kmeans.fit(X)
+t2 = time.time()
+t_batch = t2 - t1
+print("compute time " + repr(t_batch) + " secs")
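The script defines batch_size but never uses it. For comparison, a minimal mini-batch sketch with scikit-learn's MiniBatchKMeans (not part of this commit; the parameters mirror the full-batch run above):

from sklearn.cluster import MiniBatchKMeans

# Sketch only (not in this commit): same 256 centers and iteration budget,
# but centers are updated from small random batches of the data.
mbk = MiniBatchKMeans(n_clusters=256, init='random', n_init=1,
                      max_iter=10, batch_size=batch_size)
t0 = time.time()
mbk.fit(X)
print("minibatch compute time " + repr(time.time() - t0) + " secs")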

scripts/benchmarks/sklogistic.py

(file mode changed: 100755 -> 100644)
Lines changed: 8 additions & 36 deletions
@@ -6,44 +6,16 @@
 #import pylab as pl
 
 from sklearn.linear_model import SGDClassifier
-from sklearn.linear_model import LogisticRegression
-from sklearn.multiclass import OneVsRestClassifier
-from sklearn.metrics import roc_curve, auc
-from sklearn.cross_validation import train_test_split
+from sklearn.datasets.samples_generator import make_blobs
 
-print("Start reading")
-XY = sio.loadmat("/code/BIDMach/data/rcv1/all2.mat")
-X = XY["data"].transpose()
-Y = XY["cats"].transpose()
-
-X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=0)
-Y6 = Y_train[:, 6]
+XY = sio.loadmat("/data/rcv1/all.mat")
+X = XY["data"]
+Y = XY["cats"]
 print("Finished reading")
 batch_size = 10
+dim = 256
+sgd = SGDClassifier(loss='log', alpha=0.01, fit_intercept=True, max_iter=3)
 t0 = time.time()
-sgd = OneVsRestClassifier(SGDClassifier(loss='log', verbose=0, alpha=1.0e-6, penalty='l1', n_jobs=1, n_iter=1))
-sgd.fit(X_train, Y_train)
-
-t1 = time.time()
-sgd2 = SGDClassifier(loss='log', verbose=0, alpha=0.01, fit_intercept=True, n_iter=1)
-sgd2.fit(X_train, Y6)
-t2 = time.time()
-#sgd = LogisticRegression(fit_intercept=True)
-#sgd.fit(X, Y6)
-
-t_batch = t1 - t0
-Y_score = sgd.decision_function(X_test)
-fpr = dict()
-tpr = dict()
-roc_auc = np.zeros(100)
-for i in range(100):
-    fpr[i], tpr[i], _ = roc_curve(Y_test[:, i], Y_score[:, i])
-    roc_auc[i] = auc(fpr[i], tpr[i])
-
-Y6_score = sgd2.decision_function(X_test)
-fpr6, tpr6, _ = roc_curve(Y_test[:, 6], Y6_score)
-auc6 = auc(fpr6, tpr6)
-
-
+sgd.fit(X, Y)
+t_batch = time.time() - t0
 print(t_batch)
-print(t2-t1)
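This commit drops the held-out ROC evaluation along with the train/test split. If an AUC number is still wanted, a minimal sketch (assumes dense label arrays; the split, the category index 6, and the single-label fit are carried over from the deleted code, and train_test_split now lives in sklearn.model_selection rather than the removed sklearn.cross_validation):

from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split

# Sketch only: recreate the held-out split this commit removes and score
# a single RCV1 category, as the deleted sgd2 code did.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=0)
sgd.fit(X_train, Y_train[:, 6])
scores = sgd.decision_function(X_test)
fpr, tpr, _ = roc_curve(Y_test[:, 6], scores)
print("AUC for category 6: %f" % auc(fpr, tpr))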
Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.classification.SVMWithSGD
+import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.util.MLUtils
+import scala.compat.Platform._
+
+val t0 = currentTime
+// Load the training data in LIBSVM format.
+val data = MLUtils.loadLibSVMFile(sc, "/big/RCV1/v2/train6.libsvm")
+val t1 = currentTime
+
+// Split the data into training (90%) and test (10%).
+val splits = data.randomSplit(Array(0.9, 0.1), seed = 11L)
+val training = splits(0).cache()
+val test = splits(1)
+val t2 = currentTime
+
+// Run the training algorithm to build the model.
+val numIterations = 100
+val model = SVMWithSGD.train(training, numIterations)
+
+val t3 = currentTime
+
+// Clear the default threshold so predict returns raw scores.
+model.clearThreshold()
+
+// Compute raw scores on the test set. Note: RDD maps are lazy, so most of
+// the prediction work actually runs when the metrics are computed below.
+val scoreAndLabels = test.map { point =>
+  val score = model.predict(point.features)
+  (score, point.label)
+}
+
+val t4 = currentTime
+
+// Get evaluation metrics.
+val metrics = new BinaryClassificationMetrics(scoreAndLabels)
+val auROC = metrics.areaUnderROC()
+println("Area under ROC = " + auROC)
+
+println("load time %f, split %f, train %f, predict %f" format ((t1-t0)/1000f,
+  (t2-t1)/1000f, (t3-t2)/1000f, (t4-t3)/1000f))
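For reference, a hypothetical PySpark version of the same benchmark (Spark 1.x MLlib API; the LIBSVM path, split ratios, seed, and iteration count are copied from the Scala script, and the timing calls are omitted for brevity):

# Hypothetical PySpark equivalent of the Scala benchmark above; assumes a
# Spark 1.x deployment and that the LIBSVM file is reachable by the workers.
from pyspark import SparkContext
from pyspark.mllib.classification import SVMWithSGD
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.util import MLUtils

sc = SparkContext(appName="svm-rcv1-benchmark")
data = MLUtils.loadLibSVMFile(sc, "/big/RCV1/v2/train6.libsvm")
training, test = data.randomSplit([0.9, 0.1], seed=11)
training.cache()

model = SVMWithSGD.train(training, iterations=100)
model.clearThreshold()  # return raw margins instead of 0/1 labels

scoreAndLabels = test.map(lambda p: (model.predict(p.features), p.label))
metrics = BinaryClassificationMetrics(scoreAndLabels)
print("Area under ROC = %f" % metrics.areaUnderROC)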

scripts/benchmarks/testVWkmeans.sh

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+#!/bin/sh
+
+time /code/vowpal_wabbit/vowpalwabbit/vw --oaa 103 -f rcv1.model --loss_function logistic -b 24 --adaptive --invariant -l 1 --cache_file vw.cache --passes 1 -d /big/RCV1/v2/vw_sparse_train.dat
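Despite the "kmeans" in the file name, the command runs one-against-all (--oaa 103) logistic regression on RCV1. To time several configurations from one driver, a hypothetical Python wrapper over the same command (binary path and flags copied from the script above; the wrapper itself is not part of this commit):

import subprocess
import time

# Hypothetical driver around the VW invocation above; the binary path and
# flags come from testVWkmeans.sh, everything else is illustrative.
cmd = ["/code/vowpal_wabbit/vowpalwabbit/vw",
       "--oaa", "103", "-f", "rcv1.model",
       "--loss_function", "logistic", "-b", "24",
       "--adaptive", "--invariant", "-l", "1",
       "--cache_file", "vw.cache", "--passes", "1",
       "-d", "/big/RCV1/v2/vw_sparse_train.dat"]
t0 = time.time()
subprocess.check_call(cmd)
print("vw wall time: %.1f secs" % (time.time() - t0))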

scripts/testrforest.ssc

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ opts.useGPU = true
 opts.trace = 0
 opts.batchSize = 20000
 opts.depth = 30
-opts.ntrees = 20
+opts.ntrees = 33
 
 opts.nsamps = 32
 opts.nnodes = 500000

scripts/yearprediction.ssc

Lines changed: 2 additions & 2 deletions
@@ -38,9 +38,9 @@ val mm = new Learner( // make a predictor
 
 
 opts.useGPU = true
-opts.batchSize = 10000
+opts.batchSize = 20000
 opts.depth = 20
-opts.ntrees = 100
+opts.ntrees = 25
 opts.ncats = 90
 
 opts.nsamps = 25
