Commit 28e7b96

added benchmark scripts

1 parent 7d38b35
6 files changed: +83 additions, -39 deletions

scripts/benchmarks/skkmeans.py

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+import time
+
+import numpy as np
+import scipy.io as sio
+import h5py
+from sklearn.datasets import load_svmlight_file
+from sklearn.cluster import KMeans
+
+f = h5py.File('/code/BIDMach/data/MNIST8M/all.mat', 'r')
+
+t0 = time.time()
+data = f.get('/all')  # the 'all' dataset inside the HDF5 .mat file
+X = np.array(data)    # materialize it as a dense NumPy array
+t1 = time.time()
+
+t_read = t1 - t0
+print("Finished reading in " + repr(t_read) + " secs")
+
+batch_size = 10  # unused in this script; see the mini-batch sketch below
+kmeans = KMeans(n_clusters=256, init='random', n_init=1, max_iter=10,
+                tol=0.0001, precompute_distances=False, verbose=0,
+                random_state=None, copy_x=False, n_jobs=1)
+kmeans.fit(X)
+t2 = time.time()
+t_batch = t2 - t1
+print("compute time " + repr(t_batch) + " secs")
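The script defines batch_size but never uses it. For comparison, a minimal mini-batch sketch with scikit-learn's MiniBatchKMeans (not part of this commit; the parameters mirror the full-batch run above):

from sklearn.cluster import MiniBatchKMeans

# Sketch only (not in this commit): same 256 centers and iteration budget,
# but centers are updated from small random batches of the data.
mbk = MiniBatchKMeans(n_clusters=256, init='random', n_init=1,
                      max_iter=10, batch_size=batch_size)
t0 = time.time()
mbk.fit(X)
print("minibatch compute time " + repr(time.time() - t0) + " secs")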

scripts/benchmarks/sklogistic.py

(file mode changed: 100755 -> 100644)
Lines changed: 8 additions & 36 deletions
@@ -6,44 +6,16 @@
 #import pylab as pl
 
 from sklearn.linear_model import SGDClassifier
-from sklearn.linear_model import LogisticRegression
-from sklearn.multiclass import OneVsRestClassifier
-from sklearn.metrics import roc_curve, auc
-from sklearn.cross_validation import train_test_split
+from sklearn.datasets.samples_generator import make_blobs
 
-print("Start reading")
-XY = sio.loadmat("/code/BIDMach/data/rcv1/all2.mat")
-X = XY["data"].transpose()
-Y = XY["cats"].transpose()
-
-X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=0)
-Y6 = Y_train[:, 6]
+XY = sio.loadmat("/data/rcv1/all.mat")
+X = XY["data"]
+Y = XY["cats"]
 print("Finished reading")
 batch_size = 10
+dim = 256
+sgd = SGDClassifier(loss='log', alpha=0.01, fit_intercept=True, max_iter=3)
 t0 = time.time()
-sgd = OneVsRestClassifier(SGDClassifier(loss='log', verbose=0, alpha=1.0e-6, penalty='l1', n_jobs=1, n_iter=1))
-sgd.fit(X_train, Y_train)
-
-t1 = time.time()
-sgd2 = SGDClassifier(loss='log', verbose=0, alpha=0.01, fit_intercept=True, n_iter=1)
-sgd2.fit(X_train, Y6)
-t2 = time.time()
-#sgd = LogisticRegression(fit_intercept=True)
-#sgd.fit(X, Y6)
-
-t_batch = t1 - t0
-Y_score = sgd.decision_function(X_test)
-fpr = dict()
-tpr = dict()
-roc_auc = np.zeros(100)
-for i in range(100):
-    fpr[i], tpr[i], _ = roc_curve(Y_test[:, i], Y_score[:, i])
-    roc_auc[i] = auc(fpr[i], tpr[i])
-
-Y6_score = sgd2.decision_function(X_test)
-fpr6, tpr6, _ = roc_curve(Y_test[:, 6], Y6_score)
-auc6 = auc(fpr6, tpr6)
-
-
+sgd.fit(X, Y)
+t_batch = time.time() - t0
 print(t_batch)
-print(t2-t1)
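This commit drops the held-out ROC evaluation along with the train/test split. If an AUC number is still wanted, a minimal sketch (assumes dense label arrays; the split, the category index 6, and the single-label fit are carried over from the deleted code, and train_test_split now lives in sklearn.model_selection rather than the removed sklearn.cross_validation):

from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split

# Sketch only: recreate the held-out split this commit removes and score
# a single RCV1 category, as the deleted sgd2 code did.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=0)
sgd.fit(X_train, Y_train[:, 6])
scores = sgd.decision_function(X_test)
fpr, tpr, _ = roc_curve(Y_test[:, 6], scores)
print("AUC for category 6: %f" % auc(fpr, tpr))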
Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.classification.SVMWithSGD
+import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.util.MLUtils
+import scala.compat.Platform._
+
+val t0 = currentTime
+// Load the training data in LIBSVM format.
+val data = MLUtils.loadLibSVMFile(sc, "/big/RCV1/v2/train6.libsvm")
+val t1 = currentTime
+
+// Split the data into training (90%) and test (10%).
+val splits = data.randomSplit(Array(0.9, 0.1), seed = 11L)
+val training = splits(0).cache()
+val test = splits(1)
+val t2 = currentTime
+
+// Run the training algorithm to build the model.
+val numIterations = 100
+val model = SVMWithSGD.train(training, numIterations)
+
+val t3 = currentTime
+
+// Clear the default threshold so predict returns raw scores.
+model.clearThreshold()
+
+// Compute raw scores on the test set. Note: RDD maps are lazy, so most of
+// the prediction work actually runs when the metrics are computed below.
+val scoreAndLabels = test.map { point =>
+  val score = model.predict(point.features)
+  (score, point.label)
+}
+
+val t4 = currentTime
+
+// Get evaluation metrics.
+val metrics = new BinaryClassificationMetrics(scoreAndLabels)
+val auROC = metrics.areaUnderROC()
+println("Area under ROC = " + auROC)
+
+println("load time %f, split %f, train %f, predict %f" format ((t1-t0)/1000f,
+  (t2-t1)/1000f, (t3-t2)/1000f, (t4-t3)/1000f))
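For reference, a hypothetical PySpark version of the same benchmark (Spark 1.x MLlib API; the LIBSVM path, split ratios, seed, and iteration count are copied from the Scala script, and the timing calls are omitted for brevity):

# Hypothetical PySpark equivalent of the Scala benchmark above; assumes a
# Spark 1.x deployment and that the LIBSVM file is reachable by the workers.
from pyspark import SparkContext
from pyspark.mllib.classification import SVMWithSGD
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.util import MLUtils

sc = SparkContext(appName="svm-rcv1-benchmark")
data = MLUtils.loadLibSVMFile(sc, "/big/RCV1/v2/train6.libsvm")
training, test = data.randomSplit([0.9, 0.1], seed=11)
training.cache()

model = SVMWithSGD.train(training, iterations=100)
model.clearThreshold()  # return raw margins instead of 0/1 labels

scoreAndLabels = test.map(lambda p: (model.predict(p.features), p.label))
metrics = BinaryClassificationMetrics(scoreAndLabels)
print("Area under ROC = %f" % metrics.areaUnderROC)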

scripts/benchmarks/testVWkmeans.sh

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+#!/bin/sh
+
+time /code/vowpal_wabbit/vowpalwabbit/vw --oaa 103 -f rcv1.model --loss_function logistic -b 24 --adaptive --invariant -l 1 --cache_file vw.cache --passes 1 -d /big/RCV1/v2/vw_sparse_train.dat
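Despite the "kmeans" in the file name, the command runs one-against-all (--oaa 103) logistic regression on RCV1. To time several configurations from one driver, a hypothetical Python wrapper over the same command (binary path and flags copied from the script above; the wrapper itself is not part of this commit):

import subprocess
import time

# Hypothetical driver around the VW invocation above; the binary path and
# flags come from testVWkmeans.sh, everything else is illustrative.
cmd = ["/code/vowpal_wabbit/vowpalwabbit/vw",
       "--oaa", "103", "-f", "rcv1.model",
       "--loss_function", "logistic", "-b", "24",
       "--adaptive", "--invariant", "-l", "1",
       "--cache_file", "vw.cache", "--passes", "1",
       "-d", "/big/RCV1/v2/vw_sparse_train.dat"]
t0 = time.time()
subprocess.check_call(cmd)
print("vw wall time: %.1f secs" % (time.time() - t0))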

scripts/testrforest.ssc

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ opts.useGPU = true
 opts.trace = 0
 opts.batchSize = 20000
 opts.depth = 30
-opts.ntrees = 20
+opts.ntrees = 33
 
 opts.nsamps = 32
 opts.nnodes = 500000

scripts/yearprediction.ssc

Lines changed: 2 additions & 2 deletions
@@ -38,9 +38,9 @@ val mm = new Learner( // make a predictor
 
 
 opts.useGPU = true
-opts.batchSize = 10000
+opts.batchSize = 20000
 opts.depth = 20
-opts.ntrees = 100
+opts.ntrees = 25
 opts.ncats = 90
 
 opts.nsamps = 25
