scipy - Random forest in Python
I have run a random forest model in Python and am able to see the classification table. I am hoping for comprehensive code covering every aspect, starting from data prep, then model run, model validation, and accuracy check in Python. I am getting a lot of false positives in the model; any help to improve it would be appreciated.
Please see:
import urllib2
import numpy
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import random
from math import sqrt
import matplotlib.pyplot as plot

# define function for confusion matrix
def confusionMatrix(predicted, actual, threshold):
    if len(predicted) != len(actual):
        return -1
    tp = 0.0
    fp = 0.0
    tn = 0.0
    fn = 0.0
    for i in range(len(actual)):
        if actual[i] > 0.5:  # labels 1.0 (positive examples)
            if predicted[i] > threshold:
                tp += 1.0  # correctly predicted positive
            else:
                fn += 1.0  # incorrectly predicted negative
        else:  # labels 0.0 (negative examples)
            if predicted[i] < threshold:
                tn += 1.0  # correctly predicted negative
            else:
                fp += 1.0  # incorrectly predicted positive
    rtn = [tp, fn, fp, tn]
    return rtn

# hyperlink to the pendigits data
target_url = ("https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits.tra")
data = urllib2.urlopen(target_url)

xList = []
labels = []
names = []
firstLine = True
for line in data:
    # split the row on the "," sign
    row = line.strip().split(",")
    # assign label from the last column
    labels.append(float(row[-1]))
    # remove label from row
    row.pop()
    # feature vector
    floatRow = [float(num) for num in row]
    # append to xList
    xList.append(floatRow)

nrows = len(xList)
ncols = len(xList[0])

# split data into test and train
random.seed(1)
nSample = int(nrows * 0.30)
idxTest = random.sample(range(nrows), nSample)
idxTest.sort()
idxTrain = [idx for idx in range(nrows) if not (idx in idxTest)]
xTrain = [xList[r] for r in idxTrain]
xTest = [xList[r] for r in idxTest]
yTrain = [labels[r] for r in idxTrain]
yTest = [labels[r] for r in idxTest]

numTreesMax = 30
treeDepth = 12
nAttr = 4

modelList = []
indexList = []
predList = []
nTrainRows = len(yTrain)

for iTrees in range(numTreesMax):
    # pick a random subset of attributes for this tree
    idxAttr = random.sample(range(ncols), nAttr)
    idxAttr.sort()
    indexList.append(idxAttr)
    # bootstrap sample of half the training rows
    idxRows = []
    for i in range(int(0.5 * nTrainRows)):
        idxRows.append(random.choice(range(len(xTrain))))
    idxRows.sort()
    xRfTrain = []
    yRfTrain = []
    for i in range(len(idxRows)):
        temp = [xTrain[idxRows[i]][j] for j in idxAttr]
        xRfTrain.append(temp)
        yRfTrain.append(yTrain[idxRows[i]])
    modelList.append(DecisionTreeClassifier(max_depth=treeDepth))
    modelList[-1].fit(xRfTrain, yRfTrain)
    # out-of-sample prediction on the restricted attribute set
    xRfTest = []
    for xx in xTest:
        temp = [xx[i] for i in idxAttr]
        xRfTest.append(temp)
    latestOutSamplePrediction = modelList[-1].predict(xRfTest)
    predList.append(list(latestOutSamplePrediction))

classError = []
allPredictions = []
for iModels in range(len(modelList)):
    # average the predictions of the first iModels + 1 trees
    prediction = []
    for iPred in range(len(xTest)):
        prediction.append(sum([predList[i][iPred] for i in range(iModels + 1)]) / (iModels + 1))
    allPredictions.append(prediction)
    conMatTest = confusionMatrix(prediction, yTest, 0.5)
    errors = 1.0 - ((conMatTest[0] + conMatTest[3]) /
                    (conMatTest[0] + conMatTest[1] + conMatTest[2] + conMatTest[3]))
    classError.append(errors)

nModels = [i + 1 for i in range(len(modelList))]
plot.plot(nModels, classError)
plot.axis('tight')
plot.xlabel('number of trees in ensemble')
plot.ylabel('class error')
plot.ylim((0.0, max(classError)))
plot.show()
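For comparison, here is a minimal sketch of the same workflow (data prep, train/test split, model fit, accuracy check, confusion matrix) using scikit-learn's built-in RandomForestClassifier instead of hand-rolled bagging of DecisionTreeClassifier. It assumes pandas is installed and that the pendigits URL above is still reachable; on older scikit-learn versions train_test_split lives in sklearn.cross_validation rather than sklearn.model_selection.

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split  # sklearn.cross_validation on older versions
from sklearn.metrics import accuracy_score, confusion_matrix

# pendigits: 16 numeric feature columns, digit label (0-9) in the last column
target_url = ("https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits.tra")
data = pd.read_csv(target_url, header=None)
X = data.iloc[:, :-1].values  # feature vectors
y = data.iloc[:, -1].values   # digit labels

# hold out 30% of the rows for testing, mirroring the split above
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)

# bootstrap sampling and random attribute selection are built into RandomForestClassifier
model = RandomForestClassifier(n_estimators=30, max_depth=12, max_features=4, random_state=1)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))  # 10x10 matrix, one row/column per digit

Note that pendigits has ten classes rather than two, so the 2x2 confusion matrix with a 0.5 threshold in the code above does not really measure false positives; the 10x10 matrix shows which digits are being confused with which.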
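For the model-validation and false-positive parts of the question, a sketch of k-fold cross-validation plus two common levers (class weights and the predicted-probability threshold) follows; class_weight='balanced' needs a reasonably recent scikit-learn, and the 0.7 cut-off is only an illustrative assumption, not a recommendation.

from sklearn.model_selection import cross_val_score  # sklearn.cross_validation on older versions

# 5-fold cross-validation gives a more stable accuracy estimate than a single split
scores = cross_val_score(model, X, y, cv=5)
print("CV accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

# if particular classes produce too many false positives, weight their errors more heavily
weighted = RandomForestClassifier(n_estimators=30, max_depth=12,
                                  class_weight='balanced', random_state=1)
weighted.fit(X_train, y_train)

# or inspect the class probabilities and only accept predictions above a chosen threshold
proba = weighted.predict_proba(X_test)  # one probability column per class
confident = proba.max(axis=1) >= 0.7    # 0.7 is an illustrative cut-off

Raising the acceptance threshold trades false positives for abstentions (or false negatives), so it is worth checking the confusion matrix again after any such change.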
python-2.7 scipy random-forest