統計一篇英文文章中出現次數最多的10個單詞
阿新 • • 發佈:2019-02-08
package se;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

/**
 * Prints the ten most frequent words of an English text file, skipping common function words
 * (pronouns, prepositions, auxiliaries and the like).
 *
 * <p>Words are compared case-insensitively; the punctuation characters
 * {@code . , " ? ! : ; ( )} are stripped before the text is split on whitespace.
 */
public class damn {

    /** File analysed when no path is supplied on the command line. */
    private static final String DEFAULT_PATH = "C:/Users/Wll/Desktop/Computer.txt";

    /** Number of ranked words to print. */
    private static final int TOP_COUNT = 10;

    /** Lower-case words that carry no meaning of their own and are excluded from the ranking. */
    private static final Set<String> STOP_WORDS = Collections.unmodifiableSet(
            new HashSet<String>(Arrays.asList(
                    "you", "i", "he", "she", "me", "him", "her", "it", "they", "them", "we", "us",
                    "your", "yours", "our", "his", "its", "my", "in", "into", "on", "for", "out",
                    "up", "down", "at", "to", "too", "with", "by", "about", "among", "between",
                    "over", "from", "be", "been", "am", "is", "are", "was", "were", "without",
                    "the", "of", "and", "a", "an", "that", "this", "or", "as", "will", "would",
                    "can", "could", "may", "might", "shall", "should", "must", "has", "have",
                    "had", "than")));

    /**
     * Program entry point.
     *
     * @param args optional; {@code args[0]} is the path of the file to analyse. When absent, the
     *             built-in default path is used.
     * @throws IOException if the file cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        // Diagnostic output kept from the original program.
        String str2 = System.getProperty("java.io.tmpdir");
        System.out.println(str2);

        long start = System.currentTimeMillis(); // program start time

        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_PATH;
        File file = new File(path);

        List<String> words = tokenize(readText(file));
        List<Entry<String, Integer>> ranking = rank(countFrequencies(words));

        System.out.println("\t\t--檔案資訊--");
        System.out.println(" 名稱: " + file.getName() + " 大小: " + file.length() / 1024 + " KB");
        System.out.println("\t\t--檔案資訊--");
        System.out.println();
        System.out.println("■■■■ " + words.size() + " 個單詞中出現頻率最高的 " + TOP_COUNT
                + " 個單詞如下■■■■");

        int position = 1; // 1-based rank of the next word to print
        for (Entry<String, Integer> entry : ranking) {
            if (position > TOP_COUNT) {
                break;
            }
            System.out.printf("\t%2d、 %8s \t %4d次\n", position, entry.getKey(), entry.getValue());
            position++;
        }

        // Always report the elapsed time, even when the text has fewer than TOP_COUNT words.
        long end = System.currentTimeMillis(); // program end time
        System.out.println("■■■■■■■■■■■■■■■ 耗時 " + (end - start) + " ms" + " ■■■■■■■■■■■■■■■■");
    }

    /**
     * Reads the whole file into one string, joining lines with a single space so that the last
     * word of a line never fuses with the first word of the next one. The file is decoded with
     * the platform default charset.
     */
    private static String readText(File file) throws IOException {
        StringBuilder text = new StringBuilder();
        // try-with-resources closes the reader even when readLine() throws.
        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = reader.readLine()) != null) {
                text.append(line).append(' ');
            }
        }
        return text.toString();
    }

    /**
     * Strips punctuation, splits on runs of whitespace and lower-cases every token. Empty tokens
     * are dropped, so repeated blanks do not show up as a "word".
     */
    private static List<String> tokenize(String text) {
        String stripped = text.replaceAll("[.,\"\\?!:;\\(\\)]", "");
        List<String> tokens = new ArrayList<String>();
        for (String raw : stripped.split("\\s+")) {
            if (!raw.isEmpty()) {
                // Locale.ROOT keeps the mapping stable regardless of the default locale.
                tokens.add(raw.toLowerCase(Locale.ROOT));
            }
        }
        return tokens;
    }

    /** Counts how often each non-stop word occurs; expects already lower-cased tokens. */
    private static Map<String, Integer> countFrequencies(List<String> words) {
        Map<String, Integer> frequencies = new HashMap<String, Integer>();
        for (String word : words) {
            if (STOP_WORDS.contains(word)) {
                continue;
            }
            Integer seen = frequencies.get(word);
            frequencies.put(word, seen == null ? 1 : seen + 1);
        }
        return frequencies;
    }

    /**
     * Returns the entries ordered by descending count; ties are broken alphabetically so the
     * output is deterministic and the comparator honours its contract.
     */
    private static List<Entry<String, Integer>> rank(Map<String, Integer> frequencies) {
        List<Entry<String, Integer>> entries =
                new ArrayList<Entry<String, Integer>>(frequencies.entrySet());
        Collections.sort(entries, new Comparator<Entry<String, Integer>>() {
            @Override
            public int compare(Entry<String, Integer> e1, Entry<String, Integer> e2) {
                int byCount = e2.getValue().compareTo(e1.getValue());
                return byCount != 0 ? byCount : e1.getKey().compareTo(e2.getKey());
            }
        });
        return entries;
    }
}
程式執行情況如下:
總的來說,我自己覺得這個程式完成得還算不錯,而且從中也學到了很多。比如正則表示式之前沒怎麼接觸過,這次就學習了許多正則表示式相關的知識。另外,也進一步熟悉了HashMap類和ArrayList類。除此之外,還學到了一些編寫程式的方法與技巧,使得程式碼條理更加清晰。