Python讀書筆記009:文字統計
阿新 • • 發佈:2019-01-22
文字檔案的統計資料:
>>> len(s)
46
>>> s.split()
['A', 'long', 'time', 'ago,', 'in', 'a', 'galaxy', 'far,', 'far', 'away...']
>>> t = ' a long time ago in a galaxy far far away'
>>> t.split()
['a', 'long', 'time', 'ago', 'in', 'a', 'galaxy', 'far', 'far', 'away']
>>> len(t.split())
10
>>> set(t.split())
{'in', 'away', 'ago', 'far', 'a', 'galaxy', 'time', 'long'}
>>> len(set(t.split()))
8
保留想要的字母
將字串轉換成小寫:
>>> s = "I'd like a copy!"
>>> s.lower()
"i'd like a copy!"
刪除不想要的字元:
>>> s = "I'd like a copy!"
>>> s.replace('!','')
"I'd like a copy"
>>> s.replace("'",'')
'Id like a copy!'
>>> s.replace("'",' ')
'I d like a copy!'
# Characters that survive normalization: the 26 lowercase ASCII letters
# plus space, hyphen and apostrophe.
keep = set("abcdefghijklmnopqrstuvwxyz -'")


def normalize(s):
    """Return a normalized copy of s.

    The string is lowercased and every character that is not in the
    module-level ``keep`` set is dropped.
    """
    # str.join builds the result in one pass instead of repeated `+=`.
    return ''.join(c for c in s.lower() if c in keep)
>>> s = "I'd like a copy!"
>>> normalize(s)
"i'd like a copy"
文字統計:
keep = {'a', 'b', 'c', 'd', 'e', 'f', \ 'g', 'h', 'i', 'j', 'k', 'l', \ 'm', 'n', 'o', 'p', 'q', 'r', \ 's', 't', 'u', 'v', 'w', 'x', \ 'y', 'z', ' ', '-', "'"} def normalize(s): ''' Convert s to a normatlized string ''' result = '' for c in s.lower(): if c in keep: result +=c return result def make_freq_dict(s): ''' Returns a dictionary whose keys are the words of s, and whose value are the counts of those words. ''' s = normalize(s) words = s.split() d = {} for w in words: if w in d: d[w] +=1 else: d[w] =1 return d def print_file_stats(fname): ''' Print statistics for the given file. ''' s = open(fname,'r').read() num_chars = len(s) num_lines = s.count('\n') d = make_freq_dict(s) num_words = sum(d[w] for w in d) lst = [(d[w],w) for w in d] lst.sort() lst.reverse() print("The file '%s' has" % frame) print(" %s characters" % num_chars) print(" %s lines" % num_lines) print(" %s words" % num_words) print("\nThe top 10 most frequent words are:") i=1 for count, word in lst[:10]: print('%2s. %2s %s' %(i, count, word)) i +=1
>>> frame="e://Python//The Babes.txt"
>>> print_file_stats(frame)
The file 'e://Python//The Babes.txt' has
148319 characters
3118 lines
23817 words
The top 10 most frequent words are:
1. 1253 the
2. 746 and
3. 675 to
4. 657 of
5. 496 her
6. 436 a
7. 383 in
8. 352 she
9. 261 you
10. 259 daph