階段作業1:完整的中英文詞頻統計+補交上次作業
阿新 • • 發佈:2018-11-12
# Make-up homework: word-frequency statistics for the "Counting Stars" lyrics,
# followed by small list / tuple / dict / set traversal exercises.
from collections import Counter

# The lyrics are kept as one space-separated string; every fragment ends with a
# single space so the concatenation equals the original one-line text.
LYRICS = (
    "Counting stars "
    "Lately I've been, I've been losing sleep "
    "Dreaming 'bout the things that we could be "
    "But baby I've been, I've been prayin' hard "
    "Said no more counting dollars "
    "We'll be counting stars "
    "Yeah, we'll be counting stars "
    "I see this life "
    "Like a swinging vine "
    "Swing my heart across the line "
    "In my face is flashing signs "
    "Seek it out and ye shall find "
    "Old, but I'm not that old "
    "Young, but I'm not that bold "
    "And I don't think the world is sold "
    "I'm just doing what we're told "
    "I, feel something so right "
    "But doing the wrong thing "
    "I, feel something so wrong "
    "But doing the right thing "
    "I could lie, could lie, could lie "
    "everything that kills me makes me feel alive "
    "Lately I've been, I've been losing sleep "
    "Dreaming 'bout the things that we could be "
    "Baby I've been, I've been prayin' hard "
    "Said no more counting dollars "
    "We'll be counting stars "
    "Lately I've been, I've been losing sleep "
    "Dreaming 'bout the things that we could be "
    "Baby I've been, I've been prayin' hard "
    "Said no more counting dollars "
    "We'll be, we'll be counting stars "
    "I feel the love "
    "And I feel it burn "
    "Down this river every turn "
    "Hope is a four letter word "
    "Make that money "
    "Watch it burn "
    "Old, but I'm not that old "
    "Young, but I'm not that bold "
    "And I don't think the world is sold "
    "I'm just doing what we're told "
    "I, feel something so wrong "
    "But doing the right thing "
    "I could lie, could lie, could lie "
    "Everything that drowns me makes me wanna fly "
    "Lately I've been, I've been losing sleep "
    "Dreaming 'bout the things that we could be "
    "Baby I've been, I've been prayin' hard "
    "Said no more counting dollars "
    "We'll be counting stars "
    "Lately I've been, I've been losing sleep "
    "Dreaming 'bout the things that we could be "
    "Baby I've been, I've been prayin' hard "
    "Said no more counting dollars "
    "We'll be, we'll be counting stars "
    "Take that money "
    "And watch it burn "
    "Sink in the river "
)


def split_lyric_words(text):
    """Return the whitespace-separated words of *text*.

    Full stops are turned into spaces first, exactly as the original script
    did; no other punctuation is touched and case is preserved.
    """
    return text.replace('.', ' ').split()


def count_lyric_words(text):
    """Return a ``Counter`` mapping each word of *text* to its occurrences.

    Counting is done on the word list, not on the raw string: counting
    substrings of the raw string would, for example, find "be" inside "been".
    """
    return Counter(split_lyric_words(text))


def demo_containers():
    """Print the list, tuple, dict and set traversal exercises."""
    # List traversal.
    items = ['wqdq', 'dqd', 'Awd', 313, '小四', 'dqd']
    print(items)
    items.append('gegeheh')
    print(items)
    items.pop(2)
    print(items)
    for item in items:
        print(item)

    # Tuple traversal (named so that the builtin ``tuple`` is not shadowed).
    sample_tuple = ('jtfjhrr', 'rqfw f2q', 800, 10)
    print(sample_tuple[2])
    for item in sample_tuple:
        print(item)

    # Dict traversal.
    dic = {'fhehe': '4w6436', 'jgdns': 7, '4w6436': 'First'}
    print('fhehe:', dic['fhehe'])
    print('4w6436:', dic['4w6436'])
    # Two successive updates of the same key; only the last value survives.
    dic['4w6436'] = 8
    dic['4w6436'] = "對接歐文機房的維護"
    print('4w6436:', dic['4w6436'])
    print('4w6436:', dic['4w6436'])
    for key in dic:
        print(key, ':', dic.get(key))

    # Set traversal.
    numbers = {1, 2, 3, 6, 5}
    print(numbers)
    numbers.add(4)
    print(numbers)
    numbers.add('uteru')
    print(numbers)
    numbers.remove(5)
    print(numbers)
    for item in numbers:
        print(item)


def run_makeup_homework():
    """Print the lyric word statistics, then the container exercises."""
    words = split_lyric_words(LYRICS)
    # Number of words (not characters) followed by the words themselves.
    print(len(words), words)

    unique_words = set(words)
    print(unique_words)

    counts = count_lyric_words(LYRICS)
    for word in unique_words:
        print(word, counts[word])

    # Rank by frequency, most frequent first.
    ranked = list(counts.items())
    print(ranked)
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    print(ranked)

    # Top 20; slicing stays safe when there are fewer than 20 distinct words.
    for pair in ranked[:20]:
        print(pair)

    demo_containers()


if __name__ == "__main__":
    run_makeup_homework()
# Current assignment: English and Chinese word-frequency statistics read from
# text files.
from collections import Counter

# Punctuation deleted from the English text before splitting into words.
ENGLISH_PUNCTUATION = """.,:;!?"""

# Words left out of the English statistics. The text is lowercased before
# counting, so the entries must be lowercase as well.
ENGLISH_STOP_WORDS = frozenset({'is', 'be', 'i', 'we', 'the', 'in'})

# Punctuation replaced by a space in the Chinese text. The escaped characters
# are the full-width comma, colon, question mark and exclamation mark, which
# Chinese text normally uses instead of the ASCII forms.
CHINESE_PUNCTUATION = ',。:“”?!' + '\uff0c\uff1a\uff1f\uff01'

_ENGLISH_DELETE_TABLE = str.maketrans('', '', ENGLISH_PUNCTUATION)
_CHINESE_SPACE_TABLE = str.maketrans({ch: ' ' for ch in CHINESE_PUNCTUATION})


def read_text(path):
    """Return the whole content of the UTF-8 text file at *path*."""
    # The context manager closes the file even when reading raises.
    with open(path, 'r', encoding='utf-8') as handle:
        return handle.read()


def split_english_words(text):
    """Lowercase *text*, delete ``ENGLISH_PUNCTUATION`` and split on whitespace."""
    return text.lower().translate(_ENGLISH_DELETE_TABLE).split()


def english_word_counts(text, exclude=ENGLISH_STOP_WORDS):
    """Return a ``Counter`` of the words in *text*, skipping those in *exclude*."""
    return Counter(
        word for word in split_english_words(text) if word not in exclude
    )


def strip_chinese_punctuation(text):
    """Return *text* with every ``CHINESE_PUNCTUATION`` character replaced by a space."""
    return text.translate(_CHINESE_SPACE_TABLE)


def segment_chinese(text, tokenizer=None):
    """Return the word tokens of *text*.

    Punctuation is replaced by spaces, the result is passed to *tokenizer*
    (a callable taking a string and returning an iterable of tokens), and
    whitespace-only tokens are dropped. When *tokenizer* is None,
    ``jieba.cut`` is used; jieba is imported only in that case.
    """
    if tokenizer is None:
        import jieba
        tokenizer = jieba.cut
    cleaned = strip_chinese_punctuation(text)
    return [token for token in tokenizer(cleaned) if token.strip()]


def chinese_word_counts(text, tokenizer=None):
    """Return a ``Counter`` of the tokens produced by ``segment_chinese``."""
    return Counter(segment_chinese(text, tokenizer))


def run_current_homework(english_path='ccc1015.txt', chinese_path='shengxu.txt'):
    """Print word-frequency statistics for the English and the Chinese file."""
    # --- English ---
    english_text = read_text(english_path).lower()
    print(english_text)

    english_words = split_english_words(english_text)
    print(len(english_words), english_words)

    english_counts = english_word_counts(english_text)
    print(len(english_counts), set(english_counts))
    print(len(english_counts), dict(english_counts))

    # Rank by frequency, most frequent first.
    english_ranked = english_counts.most_common()
    print(english_ranked)

    # Top 20; slicing stays safe when there are fewer distinct words.
    for pair in english_ranked[:20]:
        print(pair)

    # --- Chinese ---
    story = read_text(chinese_path)
    print(story)
    print(strip_chinese_punctuation(story))

    # The statistics are built from the segmented tokens, not from a split on
    # spaces, which would only yield whole clauses and empty strings.
    tokens = segment_chinese(story)
    print(len(tokens), tokens)

    chinese_counts = Counter(tokens)
    print(len(chinese_counts), set(chinese_counts))

    # Top 10.
    for pair in chinese_counts.most_common(10):
        print(pair)


if __name__ == "__main__":
    run_current_homework()