部分程式碼3
阿新 • • 發佈:2018-11-05
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author: Enoch time:2018/10/30 0030
"""Report the most frequent k-word phrases found in a text file."""
import re
import time
from collections import Counter
import os
import sys
import cProfile

# A "sentence" is a run of two or more lowercase ASCII words separated by
# single spaces. Any other character (punctuation, digits, ...) ends the run,
# so a phrase never spans such a character.
_SENTENCE_RE = re.compile(r'(?:[a-z]+ )+[a-z]+')
_WHITESPACE_RE = re.compile(r'\s+')


def _rank_phrases(text, k):
    """Count every k-word phrase in *text* and rank the phrases.

    Args:
        text: Raw text to analyse; it is lower-cased and its whitespace runs
            are collapsed to single spaces before matching.
        k: Number of words per phrase. Values below 1 yield no phrases.

    Returns:
        A ``(ranking, total)`` tuple. ``ranking`` is a list of
        ``(phrase, count)`` pairs ordered by descending count, ties broken
        alphabetically by phrase. ``total`` is the sum of all counts.
    """
    if k < 1:
        return [], 0

    normalized = _WHITESPACE_RE.sub(' ', text.lower())
    counts = Counter()
    for sentence in _SENTENCE_RE.findall(normalized):
        words = sentence.split(' ')
        # Sliding window: one phrase per start position that still leaves
        # k words inside this sentence (empty when the sentence is shorter).
        counts.update(
            ' '.join(words[start:start + k])
            for start in range(len(words) - k + 1)
        )

    ranking = sorted(counts.items(), key=lambda item: (-item[1], item[0]))
    return ranking, sum(counts.values())


# Original header: Author: Thomas, Date: 2018.10.22
def CountPhrases(file_name, k, top_n=2):
    """Print the most frequent k-word phrases of a text file.

    Each printed row holds a phrase and its share of all counted phrase
    occurrences, formatted as a percentage. The last line printed is the
    elapsed time, in seconds, spent reading and counting (printing excluded).

    Args:
        file_name: Path of the text file to analyse.
        k: Number of words per phrase.
        top_n: How many of the highest-ranked phrases to print. Defaults to
            2; ``None`` prints every phrase.

    Returns:
        None. All results are written to standard output.

    Raises:
        OSError: If *file_name* cannot be opened or read.
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time.
    started = time.perf_counter()

    # Only ASCII letters are ever matched, so undecodable bytes can safely be
    # replaced instead of making the result depend on the locale's encoding.
    with open(file_name, encoding='utf-8', errors='replace') as f:
        text = f.read()

    ranking, total = _rank_phrases(text, k)
    elapsed = time.perf_counter() - started

    # An empty ranking skips the loop, so total is never zero in the division.
    for phrase, count in ranking[:top_n]:
        print("|\t{:15}|{:<11.2%}|".format(phrase, count / total))
    print(elapsed)


if __name__ == '__main__':
    CountPhrases('../gone_with_the_wind.txt', 2)