// To be or not to be ...
// Posted by 阿新 on 2018-12-22
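/*
Goal: build an inverted index for a set of files.
step1:
|--Assign every file an index number of the form DocID_<n>
|--First list all entries of the document directory (File.listFiles())
|--Write each document ID and its path to a document index file, docIndex.txt
step2:
|--Load each file by the path recorded in docIndex.txt and split its content into words
|--Count how often each word occurs in each file and write the statistics to the result file wordIndex.txt
*/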
import java.io.*;
import java.util.*;
/**
 * Builds a simple inverted index over the files of a directory.
 *
 * <p>Step 1 ({@link #getFileIndex}) writes one line per regular file to a document
 * index file in the form {@code DocID_<n>\t<absolute path>}. Step 2
 * ({@link #getWordsFrequency}) reads that index back, tokenizes every listed file and
 * writes, for each word, the per-document occurrence counts to a word index file.
 */
class InvertedEngine
{
    /**
     * Indexes the {@code documents} directory into {@code docIndex.txt} and
     * {@code wordIndex.txt}, both relative to the working directory.
     *
     * @param args unused
     * @throws IOException if reading a document or writing the word index fails
     */
    public static void main(String[] args) throws IOException
    {
        String filePath = "documents";
        String docIndex = "docIndex.txt";
        String wordIndex = "wordIndex.txt";
        getFileIndex(filePath, docIndex);
        getWordsFrequency(docIndex, wordIndex);
        System.out.println("Work Done!" );
    }

    /**
     * Writes the document index: one {@code DocID_<n>\t<absolute path>} line for every
     * regular file directly inside {@code filePath}.
     *
     * <p>I/O failures are reported on standard output rather than thrown.
     *
     * @param filePath directory whose files are to be indexed
     * @param docIndex path of the document index file to (over)write
     */
    public static void getFileIndex(String filePath , String docIndex)
    {
        File[] fileList = new File(filePath).listFiles();
        if (fileList == null)
        {
            // listFiles() returns null when the path is not a directory or cannot be read;
            // treat that as "no documents" instead of failing with a NullPointerException.
            System.out.println("Cannot list directory: " + filePath);
            fileList = new File[0];
        }

        BufferedWriter bufw = null;
        try
        {
            bufw = new BufferedWriter(new FileWriter(docIndex));
            int docNumber = 0;
            for (File entry : fileList)
            {
                // Sub-directories cannot be opened as documents later on, so skip them
                // and keep the document numbers consecutive.
                if (!entry.isFile())
                {
                    continue;
                }
                bufw.write("DocID_" + docNumber + "\t" + entry.getAbsolutePath());
                bufw.newLine();
                docNumber++;
            }
        }
        catch (IOException e)
        {
            System.out.println("開啟檔案失敗" + e);
        }
        finally
        {
            try
            {
                // close() flushes the buffer, so no per-line flush is needed above.
                if (bufw != null)
                {
                    bufw.close();
                }
            }
            catch (IOException ex)
            {
                System.out.println("關閉檔案失敗" + ex);
            }
        }
    }

    /**
     * Reads the document index, counts the words of every listed document and writes the
     * result to the word index file. Each output line has the form
     * {@code <word>\t{DocID_a=<count>, DocID_b=<count>, ...}}, sorted by word.
     *
     * @param docIndex  path of the document index file produced by {@link #getFileIndex}
     * @param wordIndex path of the word index file to (over)write
     * @throws IOException if the index or a document cannot be read, or the result cannot be written
     */
    public static void getWordsFrequency(String docIndex , String wordIndex) throws IOException
    {
        TreeMap<String,TreeMap<String,Integer>> tmp = new TreeMap<String,TreeMap<String,Integer>>();

        BufferedReader bufr = new BufferedReader(new FileReader(docIndex));
        try
        {
            String docIDandPath;
            while ((docIDandPath = bufr.readLine()) != null)
            {
                // Limit 2: only the first tab separates ID and path, so a path that itself
                // contains a tab is kept intact.
                String[] docInfo = docIDandPath.split("\t", 2);
                if (docInfo.length < 2)
                {
                    continue; // blank or malformed line: nothing to index
                }
                countWordsInDoc(docInfo[0], docInfo[1], tmp);
            }
        }
        finally
        {
            bufr.close();
        }

        writeWordIndex(tmp, wordIndex);
    }

    /** Tokenizes one document and adds its word occurrences to {@code tmp}; always closes the document. */
    private static void countWordsInDoc(String docID, String docPath,
            TreeMap<String,TreeMap<String,Integer>> tmp) throws IOException
    {
        BufferedReader bufrDoc = new BufferedReader(new FileReader(docPath));
        try
        {
            String wordLine;
            while ((wordLine = bufrDoc.readLine()) != null)
            {
                // "\\W" splits on every single non-word character, so consecutive
                // separators yield empty tokens that must be filtered out.
                for (String wordOfDoc : wordLine.split("\\W"))
                {
                    if (!wordOfDoc.equals(""))
                    {
                        wordDeal(wordOfDoc, docID, tmp);
                    }
                }
            }
        }
        finally
        {
            bufrDoc.close();
        }
    }

    /** Writes one {@code <word>\t<per-document counts>} line per entry of {@code tmp}; always closes the file. */
    private static void writeWordIndex(TreeMap<String,TreeMap<String,Integer>> tmp,
            String wordIndex) throws IOException
    {
        BufferedWriter bufw = new BufferedWriter(new FileWriter(wordIndex));
        try
        {
            for (Map.Entry<String,TreeMap<String,Integer>> em : tmp.entrySet())
            {
                bufw.write(em.getKey() + "\t" + em.getValue());
                bufw.newLine();
            }
        }
        finally
        {
            bufw.close();
        }
    }

    /**
     * Records one occurrence of a word in a document.
     *
     * @param wordOfDoc the word as it appears in the document; it is lower-cased before counting
     * @param docID     identifier of the document the word was found in
     * @param tmp       the index being built: word -> (document ID -> occurrence count); modified in place
     */
    public static void wordDeal(String wordOfDoc,String docID,TreeMap<String,TreeMap<String,Integer>> tmp)
    {
        // Locale.ROOT keeps the folding independent of the machine's default locale.
        String word = wordOfDoc.toLowerCase(Locale.ROOT);

        TreeMap<String,Integer> tmpST = tmp.get(word);
        if (tmpST == null)
        {
            // First time this word is seen in any document.
            tmpST = new TreeMap<String,Integer>();
            tmp.put(word, tmpST);
        }

        // "count + 1", not "count++": the post-increment expression evaluates to the old
        // value, which would leave the stored count stuck at 1.
        Integer count = tmpST.get(docID);
        tmpST.put(docID, (count == null) ? 1 : count + 1);
    }
}