Lucene: counting word occurrences (term frequencies) and sorting the results
By 阿新 · Published 2018-09-24
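The example below indexes a single text file with the IK Analyzer, walks the inverted index to collect each term's total frequency, and prints the terms in descending order of frequency. It assumes a Lucene 5.x-or-later API (IndexWriterConfig(Analyzer), LeafReader), an IK Analyzer build compatible with that Lucene version, and Apache Commons IO for IOUtils.readFully, all on the classpath; the index directory F:/luceneIndex and the input file f:/qz/twitter.txt are hard-coded.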
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Paths;
import java.util.*;
import java.util.Map.Entry;

import org.apache.commons.io.IOUtils; // assumed source of readFully()
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.wltea.analyzer.lucene.IKAnalyzer; // use a build matching your Lucene version

public class WordCount {

    static Directory directory;
    // create the analyzer
    static Analyzer analyzer = new IKAnalyzer();
    static IndexWriterConfig config = new IndexWriterConfig(analyzer);
    static IndexWriter writer;
    static IndexReader reader;
    static {
        // specify the index directory and writer configuration;
        // OpenMode.CREATE rebuilds the index on each run, so stale segments
        // from a previous run are replaced automatically
        try {
            directory = FSDirectory.open(Paths.get("F:/luceneIndex"));
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
            writer = new IndexWriter(directory, config);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        indexCreate();
        Map<String, Long> map = getTotalFreqMap();
        Map<String, Long> sortMap = sortMapByValue(map);
        Set<Entry<String, Long>> entrySet = sortMap.entrySet();
        Iterator<Entry<String, Long>> iterator = entrySet.iterator();
        while (iterator.hasNext()) {
            Entry<String, Long> entry = iterator.next();
            System.out.println(entry.getKey() + "----" + entry.getValue());
        }
    }

    /**
     * Create the index.
     */
    public static void indexCreate() {
        // make sure the index directory exists; OpenMode.CREATE (set above)
        // already replaces any previous index, so nothing has to be deleted here
        File file = new File("F:/luceneIndex");
        if (!file.exists()) {
            file.mkdirs();
        }

        // wrap the collected data in a Document
        Document doc = new Document();
        FieldType ft = new FieldType();
        ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
        ft.setStored(true);
        ft.setStoreTermVectors(true);
        ft.setTokenized(true);
        // ft.setStoreTermVectorOffsets(true);
        // ft.setStoreTermVectorPositions(true);

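        // Note: DOCS_AND_FREQS keeps per-document term frequencies in the
        // postings, which is what totalTermFreq() relies on further down;
        // the term vectors are not strictly needed for this field-level count.
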
        // read the file content (the file is small, so readFully is fine)
        File content = new File("f:/qz/twitter.txt");
        try (FileInputStream in = new FileInputStream(content)) {
            byte[] buffer = new byte[(int) content.length()];
            IOUtils.readFully(in, buffer);
            // assumes the file is UTF-8 encoded
            doc.add(new Field("twitter", new String(buffer, StandardCharsets.UTF_8), ft));
        } catch (Exception e) {
            e.printStackTrace();
        }

        // write the document to the index
        try {
            writer.addDocument(doc);
            // close() commits the pending document and releases the write lock
            writer.close();

        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Build a map of term -> total term frequency from the index.
     */
    public static Map<String, Long> getTotalFreqMap() {
        Map<String, Long> map = new HashMap<String, Long>();
        try {
            reader = DirectoryReader.open(directory);
            List<LeafReaderContext> leaves = reader.leaves();
            for (LeafReaderContext leafReaderContext : leaves) {
                LeafReader leafReader = leafReaderContext.reader();

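                // terms("twitter") walks the inverted index of the field;
                // totalTermFreq() is the total number of occurrences of the
                // current term within this segment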
                Terms terms = leafReader.terms("twitter");

                TermsEnum iterator = terms.iterator();

                BytesRef term = null;

                while ((term = iterator.next()) != null) {
                    String text = term.utf8ToString();
                    // merge rather than put, so counts are summed correctly
                    // if the index ever contains more than one segment
                    map.merge(text, iterator.totalTermFreq(), Long::sum);
                }

            }
            reader.close();
            return map;
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Sort a map by value, highest frequency first.
     *
     * @param oriMap the unsorted frequency map
     * @return a LinkedHashMap in descending value order
     */
    public static Map<String, Long> sortMapByValue(Map<String, Long> oriMap) {
        if (oriMap == null || oriMap.isEmpty()) {
            return null;
        }
        Map<String, Long> sortedMap = new LinkedHashMap<String, Long>();

        List<Map.Entry<String, Long>> entryList = new ArrayList<Map.Entry<String, Long>>(oriMap.entrySet());
        Collections.sort(entryList, new MapValueComparator());

        Iterator<Map.Entry<String, Long>> iter = entryList.iterator();
        Map.Entry<String, Long> tmpEntry = null;
        while (iter.hasNext()) {
            tmpEntry = iter.next();
            sortedMap.put(tmpEntry.getKey(), tmpEntry.getValue());
        }
        return sortedMap;
    }
}

class MapValueComparator implements Comparator<Map.Entry<String, Long>> {

    @Override
    public int compare(Entry<String, Long> me1, Entry<String, Long> me2) {
        // compare the Long values themselves (== would compare object references);
        // reversed arguments give descending order, highest frequency first
        return me2.getValue().compareTo(me1.getValue());
    }
}
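
As a sanity check, the same counts can be produced directly from the analyzer's TokenStream, without building an index at all. The class below is a minimal sketch (the class name and the text parameter are made up for illustration), assuming the same IKAnalyzer and that the file content has already been read into a String:

import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class TokenStreamWordCount {

    // count term occurrences straight from the analyzer, no index involved
    public static Map<String, Long> count(String text) throws Exception {
        Map<String, Long> freq = new HashMap<String, Long>();
        try (Analyzer analyzer = new IKAnalyzer();
             TokenStream ts = analyzer.tokenStream("twitter", text)) {
            CharTermAttribute termAttr = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                 // required before the first incrementToken()
            while (ts.incrementToken()) {
                freq.merge(termAttr.toString(), 1L, Long::sum);
            }
            ts.end();                   // required after the last token
        }
        return freq;
    }
}

Because no index, positions, or stored fields are involved, this is a quick way to verify that the totals printed by WordCount match what the analyzer actually emits.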
Map-sorting code reference: https://www.cnblogs.com/zhujiabin/p/6164826.html
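
On Java 8 and later, sortMapByValue plus MapValueComparator can also be replaced by a stream pipeline; a minimal sketch of that alternative (the class name is illustrative):

import java.util.LinkedHashMap;
import java.util.Map;
import java.util.stream.Collectors;

public class SortByValueDesc {

    // sort a frequency map by value, highest count first, and keep that
    // order by collecting into a LinkedHashMap
    public static Map<String, Long> sort(Map<String, Long> freq) {
        return freq.entrySet().stream()
                .sorted(Map.Entry.<String, Long>comparingByValue().reversed())
                .collect(Collectors.toMap(
                        e -> e.getKey(),
                        e -> e.getValue(),
                        (a, b) -> a,
                        LinkedHashMap::new));
    }
}

Entry.comparingByValue() compares the Long values with compareTo, which avoids the reference-equality pitfall that a hand-written comparator has to guard against.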