程式人生 > 分治法實現分類的 Python 實現

分治法實現分類的python實現

 
def makeDataSet(fileName):
    """Read patient records from a comma-separated data file.

    Every usable line must hold exactly 11 comma-separated fields: a
    patient id, nine integer attribute values, and a diagnosis code.
    A diagnosis code of '4' is recorded as 'm' (malignant); any other
    code is recorded as 'b' (benign). Blank lines and lines containing
    a '?' (missing attribute) are skipped.

    Args:
        fileName: Path of the data file to read.

    Returns:
        A list of tuples ``(id, 'm' | 'b', a1, ..., a9)`` where the id is
        kept as a string and the nine attributes are ints.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a line does not have exactly 11 fields or an
            attribute is not an integer.
    """
    tSet = []

    # The with-block closes the file even when a line fails to parse.
    with open(fileName) as dataFile:
        for lineNumber, fLine in enumerate(dataFile, start=1):
            fLine = fLine.strip()  # drop the trailing newline and padding

            # Skip blank lines (e.g. a trailing empty line) and any patient
            # whose record contains a missing value marked with '?'.
            if not fLine or '?' in fLine:
                continue

            fields = fLine.split(',')
            if len(fields) != 11:
                raise ValueError(
                    "line %d of %s: expected 11 comma-separated fields, got %d"
                    % (lineNumber, fileName, len(fields)))

            patientId = fields[0]
            diagMorB = 'm' if fields[10] == '4' else 'b'
            attributes = tuple(int(value) for value in fields[1:10])
            tSet.append((patientId, diagMorB) + attributes)

    return tSet
           
def sumLists(list1,list2):
    """Return a new 9-element list holding list1[i] + list2[i] for i in 0..8.

    Only the first nine positions of each argument are read; both
    arguments must be indexable and at least nine items long.
    """
    return [list1[pos] + list2[pos] for pos in range(9)]

def makeAverages(sumList,total):
    """Return a 9-element list with each of the first nine sums divided by total.

    The divisor is converted to float, so the results are always floats.
    """
    return [sumList[pos] / float(total) for pos in range(9)]

def trainClassifier(trainingSet):
    """Derive nine separator values from labelled patient tuples.

    Each tuple is (id, label, a1..a9). Tuples labelled 'b' are treated as
    benign; every other label is treated as malignant. For each of the
    nine attributes the separator is the midpoint between the benign
    average and the malignant average.

    Returns:
        A list of nine floats, one separator per attribute.
    """
    # Running attribute totals and record counts, keyed by class label.
    totals = {'b': [0] * 9, 'm': [0] * 9}
    seen = {'b': 0, 'm': 0}

    for record in trainingSet:
        label = 'b' if record[1] == 'b' else 'm'
        totals[label] = sumLists(totals[label], record[2:])
        seen[label] += 1

    benignMeans = makeAverages(totals['b'], seen['b'])
    malignantMeans = makeAverages(totals['m'], seen['m'])

    # Midpoint of the two class averages for every attribute.
    return makeAverages(sumLists(benignMeans, malignantMeans), float(2))


def classifyTestSet(testSet,classifier):
    """Count benign and malignant attribute votes for each patient.

    An attribute votes malignant when its value is strictly greater than
    the matching separator in ``classifier``; otherwise it votes benign.

    Returns:
        A list of tuples (patient id, benign votes, malignant votes,
        actual diagnosis), one per patient, in input order.
    """
    outcomes = []
    for record in testSet:
        # Attributes start at position 2 of the patient tuple.
        malignantVotes = sum(
            1 for pos in range(9) if record[pos + 2] > classifier[pos])
        benignVotes = 9 - malignantVotes
        outcomes.append((record[0], benignVotes, malignantVotes, record[1]))
    return outcomes

def reportResults(results):
    """Print how many patients were classified and how many were misclassified.

    Each result is (id, benign votes, malignant votes, actual diagnosis).
    The prediction is 'b' when the benign votes outnumber the malignant
    votes, and 'm' otherwise.
    """
    seen = 0
    misses = 0

    for outcome in results:
        seen += 1
        # The actual label that would contradict the prediction.
        contradicting = 'm' if outcome[1] > outcome[2] else 'b'
        if outcome[3] == contradicting:
            misses += 1

    print("of %d" %seen,"patients,there were %d"%misses,'inaccuracies')
           
def main():
    """Train on the data file, classify the same file, and print the report."""
    # Load the training records.
    print("reading in training data")
    trainingRecords = makeDataSet("breast-cancer-wisconsin.data")
    print('Done reading training data.')

    # Build the per-attribute separator values.
    print('Training classifier...')
    separators = trainClassifier(trainingRecords)
    print("Done training classifier.")

    print("The value of classifier:")
    print(separators)

    # The test data is read from the same file as the training data.
    print("Reading in test data...")
    testRecords = makeDataSet("breast-cancer-wisconsin.data")
    print('Done reading test data.')

    print('Classifying records...')
    outcomes = classifyTestSet(testRecords, separators)
    print('Done classifying.')

    reportResults(outcomes)

    print('Program finished.')


執行指令和結果如下: