Monday, 15 March 2010

scipy - Random forest in python -



scipy - Random forest in python -

I have run a random forest model in Python and am able to see the classification table. I am hoping for comprehensive code covering every aspect, starting from the code for data preparation, through the model run and model validation, to the accuracy check in Python. I am getting a lot of false positives in my model; any help to improve it would be appreciated.

Please see the code below:

# Hand-rolled random forest: bagged decision trees, each trained on a random
# subset of attributes, evaluated on a held-out 30% split of the data set
# downloaded from ``target_url``.
#
# Written to run on Python 2.7 (the version the script targets) and Python 3.

import random
from collections import Counter
from contextlib import closing
from math import sqrt  # noqa: F401 -- unused here; kept from the original import list

try:  # Python 2
    from urllib2 import urlopen
except ImportError:  # Python 3
    from urllib.request import urlopen


# Hyperlink to the training data.
target_url = ("https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits.tra")


def confusionmatrix(predicted, actual, threshold):
    """Tally a binary confusion matrix.

    An example is an actual positive when its label is greater than 0.5 and a
    predicted positive when its score is strictly greater than ``threshold``.
    The same decision rule is applied regardless of the actual label, so a
    score exactly equal to ``threshold`` is always a predicted negative.

    Args:
        predicted: sequence of numeric scores, one per example.
        actual: sequence of numeric labels, same length as ``predicted``.
        threshold: score above which an example is predicted positive.

    Returns:
        ``[tp, fn, fp, tn]`` as floats, or ``-1`` when the two sequences
        differ in length.
    """
    if len(predicted) != len(actual):
        return -1

    tp = fn = fp = tn = 0.0
    for score, label in zip(predicted, actual):
        actual_positive = label > 0.5
        predicted_positive = score > threshold
        if actual_positive and predicted_positive:
            tp += 1.0  # correctly predicted positive
        elif actual_positive:
            fn += 1.0  # incorrectly predicted negative
        elif predicted_positive:
            fp += 1.0  # incorrectly predicted positive
        else:
            tn += 1.0  # correctly predicted negative
    return [tp, fn, fp, tn]


def _load_dataset(url):
    """Download a comma-separated file; return (feature rows, labels).

    The last column of every row is taken as the label; all values are
    converted to float. Blank lines are skipped.
    """
    xlist = []
    labels = []
    # closing() releases the connection on both Python 2 and 3.
    with closing(urlopen(url)) as data:
        for raw_line in data:
            # The response yields bytes on Python 3; decode before splitting.
            line = raw_line.decode("utf-8").strip()
            if not line:
                continue
            row = line.split(",")
            labels.append(float(row.pop()))
            xlist.append([float(num) for num in row])
    return xlist, labels


def _split_indices(nrows, test_fraction):
    """Return (train indices, sorted test indices) for a random hold-out split."""
    nsample = int(nrows * test_fraction)
    idxtest = sorted(random.sample(range(nrows), nsample))
    held_out = set(idxtest)  # O(1) membership instead of scanning the list
    idxtrain = [idx for idx in range(nrows) if idx not in held_out]
    return idxtrain, idxtest


def main():
    """Train the ensemble and plot test error against the number of trees."""
    # Third-party imports are deferred so the helpers above can be imported
    # (and tested) without scikit-learn or matplotlib installed.
    import numpy  # noqa: F401 -- unused here; kept from the original import list
    from sklearn import tree  # noqa: F401 -- unused here; kept from the original
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.metrics import accuracy_score
    import matplotlib.pyplot as plot

    xlist, labels = _load_dataset(target_url)
    if not xlist:
        raise ValueError("no rows could be read from %s" % target_url)
    nrows = len(xlist)
    ncols = len(xlist[0])

    # Split the data into test and train sets.
    random.seed(1)
    idxtrain, idxtest = _split_indices(nrows, 0.30)
    xtrain = [xlist[r] for r in idxtrain]
    xtest = [xlist[r] for r in idxtest]
    ytrain = [labels[r] for r in idxtrain]
    ytest = [labels[r] for r in idxtest]

    numtreesmax = 30
    treedepth = 12
    nattr = 4

    modellist = []
    indexlist = []
    predlist = []
    ntrainrows = len(ytrain)
    nbagrows = int(0.5 * ntrainrows)
    trainrows = range(ntrainrows)

    for itrees in range(numtreesmax):
        # Each tree sees a random subset of the attributes ...
        idxattr = sorted(random.sample(range(ncols), nattr))
        indexlist.append(idxattr)

        # ... and a bootstrap sample (drawn with replacement) of the rows.
        idxrows = sorted(random.choice(trainrows) for _ in range(nbagrows))
        xrftrain = [[xtrain[r][j] for j in idxattr] for r in idxrows]
        yrftrain = [ytrain[r] for r in idxrows]

        model = DecisionTreeClassifier(max_depth=treedepth)
        model.fit(xrftrain, yrftrain)
        modellist.append(model)

        # Restrict the test rows to the attributes this tree was trained on.
        xrftest = [[xx[j] for j in idxattr] for xx in xtest]
        predlist.append(list(model.predict(xrftest)))

    # Combine the trees by majority vote. The trees predict class labels, and
    # the pendigits labels are multi-class (digits 0-9 per the UCI data set
    # description), so averaging the predicted labels and thresholding the
    # mean at 0.5 -- as a binary 0/1 ensemble would -- is not meaningful and
    # produces spurious "positives".
    classerror = []
    allpredictions = []
    votes = [Counter() for _ in xtest]
    for treepreds in predlist:
        for counts, vote in zip(votes, treepreds):
            counts[vote] += 1
        # Ensemble prediction using every tree built so far; ties go to
        # whichever label Counter.most_common lists first.
        prediction = [counts.most_common(1)[0][0] for counts in votes]
        allpredictions.append(prediction)
        classerror.append(1.0 - accuracy_score(ytest, prediction))

    nmodels = [i + 1 for i in range(len(modellist))]

    plot.plot(nmodels, classerror)
    plot.axis('tight')
    plot.xlabel('number of trees in ensamble')
    plot.ylabel('class error')
    plot.ylim((0.0, max(classerror)))
    plot.show()


if __name__ == "__main__":
    main()

python-2.7 scipy random-forest

No comments:

Post a Comment