# Code excerpt, part 4
# Posted by 阿新 on 2018-11-05
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#author: Enoch time:2018/10/30 0030
import re
import time
from collections import Counter
###################################################################################
#Name:CountVerbPre
#Inputs:text file name, verb-forms file name, prepositions file name
#outputs:None (prints the two most frequent verb + preposition phrases and the elapsed time)
#Author: Thomas
#Date:2018.10.22
###################################################################################
def _count_phrases(text):
    """Count adjacent two-word phrases that lie inside a single sentence.

    The text is lower-cased and runs of whitespace are collapsed to one
    space.  A "sentence" is a maximal run of two or more ``[a-z]+`` words
    separated by single spaces; any other character ends the sentence, so
    no phrase ever spans a sentence boundary.

    Returns a Counter mapping ``"first second"`` to its occurrence count.
    """
    text = re.sub(r'\s+', ' ', text.lower())
    counts = Counter()
    for match in re.finditer(r'(?:[a-z]+ )+[a-z]+', text):
        words = match.group(0).split(' ')
        counts.update(
            first + ' ' + second for first, second in zip(words, words[1:])
        )
    return counts


def _load_prepositions(preName):
    """Read one preposition per line and return them as a set."""
    with open(preName) as f:
        return set(f.read().split('\n'))


def _load_verb_map(verbName):
    """Read ``base -> form1,form2,...`` lines into a form-to-base dict.

    Every base form also maps to itself.  Blank lines are skipped; any other
    line that does not contain exactly one ``' -> '`` raises ValueError.
    """
    verb_base = {}
    with open(verbName) as f:
        for line in f:
            if not line.strip():
                continue
            base, forms = line.split(' -> ')
            for form in forms.replace('\n', '').split(','):
                verb_base[form] = base
            verb_base[base] = base
    return verb_base


def CountVerbPre(file_name, verbName, preName, topN=2):
    """Print the most frequent "verb preposition" phrases found in a text.

    Args:
        file_name: path of the text to analyse.
        verbName: path of the verb table, one ``base -> form1,form2,...``
            entry per line.  Inflected forms are folded into their base form
            before counting.
        preName: path of the preposition list, one preposition per line.
        topN: maximum number of phrases to print (defaults to 2).

    Returns:
        None.  Prints one ``|<phrase>|<percentage>|`` row per reported
        phrase, ordered by descending count and then alphabetically,
        followed by the elapsed processing time in seconds.
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for measuring elapsed time.
    t0 = time.perf_counter()

    with open(file_name) as f:
        phrase_counts = _count_phrases(f.read())
    prepositions = _load_prepositions(preName)
    verb_base = _load_verb_map(verbName)

    # NOTE(review): the denominator is the number of *distinct* two-word
    # phrases, while the numerators are occurrence counts, so a percentage
    # can exceed 100%.  Kept as-is to preserve the existing output; confirm
    # whether the total number of occurrences was intended instead.
    totalNum = len(phrase_counts)

    merged = Counter()
    for phrase, count in phrase_counts.items():
        verb, pre = phrase.split(' ')
        base = verb_base.get(verb)
        if base is not None and pre in prepositions:
            merged[base + ' ' + pre] += count

    # Highest count first; ties are broken alphabetically by phrase.
    ranked = sorted(merged.items(), key=lambda item: (-item[1], item[0]))
    t1 = time.perf_counter()

    for phrase, count in ranked[:topN]:
        print("|\t{:15}|{:<11.2%}|".format(phrase, count / totalNum))
    print(t1 - t0)
if __name__ == '__main__':
    # Run the analysis only when executed as a script, not on import.
    # NOTE(review): the published listing had typographic quotes and a
    # mangled "…/" path prefix; "../" is assumed here -- confirm the
    # actual location of the three input files.
    CountVerbPre('../gone_with_the_wind.txt', '../Verbs.txt', '../prepositions.txt')