IR中python 寫倒排索引與查詢處理

阿新 • • 發佈：2018-12-22

學習資訊檢索課程，老師讓寫一個倒排索引與查詢處理的程式，於是抱著試試的心態自學python寫了出來。

整個程式沒有用到什麼特別的演算法技巧，唯一需要注意的是查詢處理的部分：遞迴函式會正向、反向兩次反覆查詢，需要多花時間除錯。

資料結構（存成 mydate.py，供後面的程式以 from mydate import * 匯入）：

#-*-coding:utf-8-*-
#!/usr/bin/python

"""
Shared state for the inverted index and the query processor.

Index construction
    mydir    list of document file names
    onedoc   (loop variable used while indexing) the document being read
    mydoc    document number currently being examined by a query

    mywords  one [word, start offset in myindex] pair per distinct word,
             indexed by word number
    myindex  postings; each entry is
             [document number, word number, occurrence count, position, ...]
    wordcnt  number of distinct words
    doccnt   number of documents
    listcnt  number of postings in myindex

Three dictionaries
    mywordsdictindex  word number -> start offset of its postings in myindex
    antimywordsdict   word number -> end offset of its postings in myindex
    mywordsdict       word -> word number

Query
    mypos        per query word, its current offset into myindex
    myfindindex  per query word, its word number
    mydocs       document numbers found by the query
"""
mydir = []
mywords = []
myindex = []

mywordsdictindex = {}
antimywordsdict = {}
mywordsdict = {}

wordcnt = 0  # number of distinct words in the dictionary
doccnt = 0  # number of documents
listcnt = 0  # number of postings in myindex

mypos = []
mydocs = []
myfindindex = []

mydoc = 0
direct = 0
# Debug output emitted once when the module is first imported.  Written with
# parentheses so the statement is valid on both Python 2 and Python 3.
print(id(mydir))

建立索引（存成 createindex.py）：

#-*-coding:utf-8-*-
#!/usr/bin/python

from mydate import *
import sys
import os
import pprint
import pickle
def getmydoc(thepath, onedir):
    """Read one document and return its lines as a list of strings.

    Args:
        thepath: directory that contains the document.
        onedir: file name of the document inside thepath.

    Returns:
        One string per line of the file, in file order, with the newline
        characters stripped.  Blank lines are kept as empty strings.
    """
    # The context manager closes the file as soon as reading is done; the
    # original left the handle open until garbage collection.
    with open(thepath + '/' + onedir) as doc:
        return [line.strip('\n') for line in doc]

def createindex(thepath, docnames=('a.txt', 'b.txt', 'c.txt')):
    """Build the in-memory inverted index for documents stored under thepath.

    Every line returned by getmydoc is treated as one word.  The function
    fills the shared module-level structures:

    * mydir            the indexed file names; a document's number is its
                       position in this list
    * mywords          [word, start] per distinct word, indexed by word number
    * mywordsdict      word -> word number
    * myindex          postings sorted by word number, each one
                       [document number, word number, count, position, ...]
    * mywordsdictindex word number -> start offset of its postings in myindex
                       (the word string itself is stored as a key as well)
    * antimywordsdict  word number -> end offset (exclusive) of its postings
    * wordcnt, doccnt, listcnt  number of words, documents and postings

    Args:
        thepath: directory containing the documents.
        docnames: file names to index, in document-number order.  The
            default is the fixed list the original implementation was
            hard-wired to.  Pass None to index every non-directory entry
            of thepath, sorted by name.
    """
    global wordcnt
    global doccnt
    global listcnt

    if docnames is None:
        # Filter into a new list: removing items from a list while iterating
        # over it (as the original did) silently skips entries.
        names = sorted(
            name for name in os.listdir(thepath)
            if not os.path.isdir(thepath + '/' + name)
        )
    else:
        names = list(docnames)

    # Reset the shared containers IN PLACE.  Other modules obtained these
    # objects through "from mydate import *", so rebinding the names here
    # would leave them looking at stale objects; not resetting at all would
    # corrupt the index on a second call because the counters restart at 0.
    mydir[:] = names
    del mywords[:]
    del myindex[:]
    mywordsdict.clear()
    mywordsdictindex.clear()
    antimywordsdict.clear()
    wordcnt = 0
    doccnt = 0
    listcnt = 0

    for onedoc in mydir:
        # word -> offset in myindex of this document's posting for that word
        docworddict = {}
        for position, myword in enumerate(getmydoc(thepath, onedoc)):
            if myword not in mywordsdict:
                mywordsdict[myword] = wordcnt
                mywords.append([myword, 0])
                wordcnt += 1
            if myword not in docworddict:
                docworddict[myword] = listcnt
                myindex.append([doccnt, mywordsdict[myword], 0])
                listcnt += 1
            posting = myindex[docworddict[myword]]
            posting[2] += 1
            posting.append(position)
        doccnt += 1

    # list.sort is stable, so the postings of one word stay in the order they
    # were appended, i.e. in ascending document number.
    myindex.sort(key=lambda posting: posting[1])

    # Walk the sorted postings once and record, for every word number, the
    # half-open range [start, end) it occupies in myindex.
    total = len(myindex)
    start = 0
    for wordid, entry in enumerate(mywords):
        end = start
        while end < total and myindex[end][1] == wordid:
            end += 1
        entry[1] = start
        mywordsdictindex[entry[0]] = start
        mywordsdictindex[wordid] = start
        antimywordsdict[wordid] = end
        start = end
'''
	pprint.pprint (mywords)
	pprint.pprint (myindex)
	pprint.pprint (mywordsdict)
	pprint.pprint (mywordsdictindex)
	pprint.pprint (antimywordsdict)
	
	out=open("myindex.dat","wb")
	pickle.dump(myindex,out)
	out=open("mywords.dat","wb")
	pickle.dump(mywords,out)
'''

接收查詢與查詢處理（存成 hwf.py）：

#-*-coding:utf-8-*-
#!/usr/bin/python
#得到一個文字的列表
import sys
import os
import pprint
import pickle
import pdb
from mydate import *
def findword(loc, flag):
    """Append to mydocs every document containing all query words from loc on.

    The query is described by the shared lists mypos (current offset of each
    query word in myindex) and myfindindex (word number of each query word);
    antimywordsdict gives the exclusive end offset of each word's postings.
    The postings of a word are expected in ascending document number, which
    is how createindex lays them out.

    The function intersects the posting ranges: it repeatedly takes the
    largest document number under the cursors as the candidate, advances
    every cursor to the first posting that is not below the candidate, and
    records the candidate when all cursors land on it.  The previous
    recursive scan could skip postings, read past the end of a posting range
    and report documents that did not contain every word.

    Side effects: advances the cursors in mypos, appends matching document
    numbers to mydocs in ascending order, and sets the global mydoc to the
    most recent candidate document.

    Args:
        loc: index of the first query word to consider; callers pass 0.
        flag: kept for backward compatibility; -1 means "search already
            finished" and makes the call return immediately.

    Returns:
        1 if at least one document was appended to mydocs by this call,
        0 if none matched, -1 if flag is -1 or there is no query word at loc.
    """
    global mydoc
    if flag == -1 or loc >= len(mypos):
        return -1

    terms = list(range(loc, len(mypos)))
    ends = [antimywordsdict[myfindindex[k]] for k in terms]
    found = 0
    while True:
        # Once any posting range is exhausted no further document can
        # contain all the words.
        for k, end in zip(terms, ends):
            if mypos[k] >= end:
                return found

        candidate = max(myindex[mypos[k]][0] for k in terms)
        mydoc = candidate
        aligned = True
        for k, end in zip(terms, ends):
            cursor = mypos[k]
            while cursor < end and myindex[cursor][0] < candidate:
                cursor += 1
            mypos[k] = cursor
            if cursor == end or myindex[cursor][0] != candidate:
                aligned = False

        if aligned:
            mydocs.append(candidate)
            found = 1
            # Step every cursor past the matched document.
            for k in terms:
                mypos[k] += 1




def getwords():
    """Prompt for a query, run it against the index and print the result.

    The input line is split on whitespace into query words.  Every word must
    be present in mywordsdict; otherwise (or when no word was entered) the
    function prints 'wrong' and terminates the program through sys.exit(),
    as the original did for unknown words.

    For a valid query the shared lists myfindindex (word numbers) and mypos
    (start offsets in myindex) are filled, findword(0, 0) is run, and the
    resulting list mydocs is printed.

    Returns:
        The shared list mydocs holding the document numbers found.

    Raises:
        SystemExit: if the query is empty or contains an unknown word.
    """
    global mydoc
    global direct

    # raw_input only exists on Python 2; input is its Python 3 equivalent.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    # split() without an argument ignores repeated and surrounding
    # whitespace, so "a  b" no longer produces an empty query word.
    terms = read_line("find words\n").split()

    # Clear the per-query state in place so that a second query does not
    # inherit the cursors and results of the previous one.
    del mypos[:]
    del myfindindex[:]
    del mydocs[:]

    if not terms or any(term not in mywordsdict for term in terms):
        print('wrong')
        sys.exit()

    for term in terms:
        myfindindex.append(mywordsdict[term])
        mypos.append(mywordsdictindex[term])

    # Start from the first document of the first query word.
    mydoc = myindex[mypos[0]][0]
    direct = 0

    findword(0, 0)
    print(mydocs)
    return mydocs

使用：

#-*-coding:utf-8-*-
#!/usr/bin/python
import hwf
from mydate import *
import createindex
import sys
import os
import pprint
import pickle


createindex.createindex('.')# build the index for the documents under the current directory
hwf.getwords()# prompt for query words and print the document numbers found

IR中python 寫倒排索引與查詢處理

學習資訊檢索課程，老師讓寫一個倒排索引與查詢處理的程式，於是抱著試試的心態自學python寫了出來。整個沒有什麼太大的演算法技巧，唯一的就是查詢處理那裡遞迴函式正反兩次反覆查詢需要多除錯下。資料結構： #-*-coding:utf-8-*- #!/usr/bin/pyt

ES倒排索引與三種Cache詳細介紹

網上看到的一篇文章，對Lucene的倒排索引是如何執行的，說的比較易懂，就轉過來分享下。 Elasticsearch是通過Lucene的倒排索引技術實現比關係型資料庫更快的過濾。特別是它對多條件的過濾支援非常好，比如年齡在18和30之間，性別為女性這樣的組合查詢。倒排索引很多地方都有介紹，但

ES倒排索引與分詞詳解

倒排索引正排索引：文件id到單詞的關聯關係倒排索引：單詞到文件id的關聯關係示例：對以下三個文件去除停用詞後構造倒排索引 image 倒排索引-查詢過程查詢包含“搜尋引擎”的文件通過倒排索引獲得“搜尋引擎”對應的文

倒排索引與分詞

倒排索引正排索引：文件ID到文件內容、單詞的關聯關係倒排索引：單詞到文件ID的關聯關係倒排索引查詢流程：（以查詢包含“搜尋引擎”的文件為例）通過倒排索引獲得“搜尋引擎”對應的文件ID有1和3 通過正排索引查詢1和3的完整內容返回使用者最終

python 實現倒排索引

程式碼如下： #encoding:utf-8 fin = open('1.txt', 'r') ''' 建立正向索引: “文件1”的ID > 單詞1：出現位置列表；單詞2：出現位置列表；…

【Python】倒排索引

程式碼連結預處理 word stemming 一個單詞可能不同的形式，在英語中比如動詞的主被動、單複數等。比如live\lives\lived. 雖然英文的處理看起來已經很複雜啦但實際在中文裡的處理要更加複雜的多。 stop wo

2 Elasticsearch 篇之倒排索引與分詞

文章目錄書的目錄與索引正排與倒排索引簡介倒排索引詳解分詞介紹 analyze_api 自帶分詞器 Standard Analyzer Simple Analyzer W

jieba分詞python建立倒排索引

# encoding=utf-8 import json import jieba from sys import argv from collections import defaultdict path = argv[1] objs = map(lambda s: j

elasticsearch篇之正/倒排索引與分詞

正/倒排索引類似於書的目錄，目錄能夠方便的定位哪一章節或哪一小節的頁碼，但是無法定位某一關鍵字的位置。有一些書的最後有索引頁，它的功能就是幫助定位某些關鍵字出現的位置。目錄頁對應正排索引索引頁對應倒排索引正排索引和倒排索引對於搜尋

elasticsearch倒排索引與TF-IDF演算法

elasticsearch專欄：https://www.cnblogs.com/hello-shf/category/1550315.html 一、倒排索引（Inverted Index）簡介在關係資料庫系統裡，索引是檢索資料最有效率的方式。但對於搜尋引擎，它並不能滿足其特殊要求，比如海量資料下比如百度

【原創】python倒排索引之查詢包含某主題或單詞的檔案

什麼是倒排索引？倒排索引（英語：Inverted index），也常被稱為反向索引、置入檔案或反向檔案，是一種索引方法，被用來儲存在全文搜尋下某個單詞在一個文件或者一組文件中的儲存位置的對映。它是文件檢索系統中最常用的資料結構。通過倒排索引，可以根據單詞快速獲取包含這個單詞的文件列表。倒排索引主要由兩個部分

第三百六十一節，Python分布式爬蟲打造搜索引擎Scrapy精講—倒排索引

索引原理文章根據 file 索引 -i span 需要 style 第三百六十一節，Python分布式爬蟲打造搜索引擎Scrapy精講—倒排索引倒排索引倒排索引源於實際應用中需要根據屬性的值來查找記錄。這種索引表中的每一項都包括一個屬性值和具有該屬性值的各記錄的

搜尋引擎之正排與倒排索引

正排索引（正向索引）正排表是以文件的ID為關鍵字，表中記錄文件中每個字的位置資訊，查詢時掃描表中每個文件中字的資訊直到找出所有包含查詢關鍵字的文件。正排表結構如圖1所示，這種組織方法在建立索引的時候結構比較簡單，建立比較方便且易於維護;因為索引是基於文件建立的，若是有新的文件加入，直接為

python硬剛倒排索引

需要匯入的庫：jieba, json 本程式碼採用直接硬剛倒排索引，可能會引起稍微不適，請選用。程式碼分為三部分：分詞、建立正排索引、建立倒排索引需要檔案：語料庫、停用詞庫語料庫圖片如下：我用的是自己爬取的一部分新聞標題，包含網易，頭條，鳳凰網以及一小部分微信文章標題。語料庫處理：只需要

lucene中倒排索引的記憶體結構

簡介 lucene索引格式是個老生常談的問題，網上也有一些資料，但是由於年代比較古老（大都是基於3.x或者4.x的版本），和現有程式碼較難對上，這裡基於lucene6.6重新講解下，也幫助自己理解和記憶。基本概念這些資訊很容易理解，看程式碼的

python倒排索引

一. 實驗目的 1.掌握列表、集合和字典的定義、賦值、使用等基本操作，熟悉處理複雜資料型別的一般流程 2.熟悉列表、集合和字典的常用函式和技巧 3.考察對文字的靈活處理和對排序演算法的運用二. 實驗內容倒排索引（Inverted index），也常

lucene原始碼分析—倒排索引的寫過程

lucene將倒排索引的資訊寫入.tim和.tip檔案，這部分程式碼也是lucene最核心的一部分。倒排索引的寫過程從BlockTreeTermsWriter的write函式開始， BlockTreeTermsWriter::write public void wri

Python 倒排索引

# -*- coding: utf-8 -*- '''Part 1 : Setup index''' dict = {} # a emtry dictionary. n = 100 for row in range(0,n): information = raw_input()

正排索引(forward index)與倒排索引(inverted index)

一、正排索引（前向索引）正排索引也稱為"前向索引"。它是建立倒排索引的基礎，具有以下欄位。（1）LocalId欄位（表中簡稱"Lid"）：表示一個文件的區域性編號。（2）WordId欄位：表示文件分詞後的編號，也可稱為"索引詞編號"。（3）NHits欄位：

Lucene 初學者實戰（二）正排索引與倒排索引

Lucene：基於傳統全文檢索引擎的倒排索引，並實現了分塊索引。與倒排所引相對立的是正排索引，也成為正向所引。本文將簡單介紹。 1 正排索引（forward index）由key查詢實體的過程，是正排索引. 在搜尋引擎中每個檔案都對應一個檔案ID，檔案內容被表示為一

IR中python 寫倒排索引與查詢處理

相關推薦