1. 程式人生 > >使用Lucene-Spatial實現整合地理位置的全文檢索

使用Lucene-Spatial實現整合地理位置的全文檢索

        Lucene通過Spatial包提供了對基於地理位置的全文檢索的支援,最典型的應用場景就是:“搜尋中關村附近1公里內的火鍋店,並按遠近排序”。使用Lucene-Spatial新增對地理位置的支援,和之前普通文字搜尋主要有兩點區別:

        1. 將座標資訊轉化為笛卡爾層,建立索引

      /**
       * Adds the location of one record to the given document: the prefix-coded
       * latitude and longitude, plus one Cartesian tier box id field for every
       * tier from {@code startTier} to {@code endTier} inclusive.
       *
       * @param document the document that receives the location fields
       * @param jo       the record; its "longitude" and "latitude" entries are read as doubles
       * @throws Exception if a coordinate cannot be read from {@code jo}
       */
      private void indexLocation(Document document, JSONObject jo)
			throws Exception {

		final double lng = jo.getDouble("longitude");
		final double lat = jo.getDouble("latitude");

		// Raw coordinates, stored so the exact distance can be computed at search time.
		final String encodedLat = NumericUtils.doubleToPrefixCoded(lat);
		final String encodedLng = NumericUtils.doubleToPrefixCoded(lng);
		document.add(new Field("lat", encodedLat, Field.Store.YES,
				Field.Index.NOT_ANALYZED));
		document.add(new Field("lng", encodedLng, Field.Store.YES,
				Field.Index.NOT_ANALYZED));

		// One box-id field per tier.
		int tier = startTier;
		while (tier <= endTier) {
			ctp = new CartesianTierPlotter(tier, projector,
					CartesianTierPlotter.DEFALT_FIELD_PREFIX);
			final String encodedBoxId = NumericUtils.doubleToPrefixCoded(
					ctp.getTierBoxId(lat, lng));
			document.add(new Field(ctp.getTierFieldName(), encodedBoxId,
					Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
			tier++;
		}
	}

        2. 搜尋時,使用DistanceQueryBuilder建立距離過濾器(Filter),並指定按距離排序
// Distance query around (latitude, longitude); the radius is passed in the variable named miles.
DistanceQueryBuilder dq = new DistanceQueryBuilder(
		latitude, longitude, miles,
		"lat", "lng", CartesianTierPlotter.DEFALT_FIELD_PREFIX,
		true, startTier, endTier);

// Order the hits with a comparator built from the query's distance filter.
DistanceFieldComparatorSource dsort =
		new DistanceFieldComparatorSource(dq.getDistanceFilter());
SortField byDistance = new SortField("geo_distance", dsort);
Sort sort = new Sort(byDistance);

      下面是基於Lucene3.2.0和JUnit4.8.2的完整程式碼。
  <!-- Libraries used by the example: JUnit, Lucene core, Lucene spatial and org.json. -->
  <dependencies>
  	<!-- JUnit 4.8.2, test scope only. -->
  	<dependency>
  		<groupId>junit</groupId>
  		<artifactId>junit</artifactId>
  		<version>4.8.2</version>
  		<type>jar</type>
  		<scope>test</scope>
  	</dependency>
  	<!-- Lucene core 3.2.0. -->
  	<dependency>
  		<groupId>org.apache.lucene</groupId>
  		<artifactId>lucene-core</artifactId>
  		<version>3.2.0</version>
  		<type>jar</type>
  		<scope>compile</scope>
  	</dependency>
  	<!-- Lucene spatial module; same version as lucene-core. -->
  	<dependency>
  		<groupId>org.apache.lucene</groupId>
  		<artifactId>lucene-spatial</artifactId>
  		<version>3.2.0</version>
  		<type>jar</type>
  		<scope>compile</scope>
  	</dependency>
  	<!-- org.json, which provides the JSONObject used to parse the test data lines. -->
  	<dependency>
  		<groupId>org.json</groupId>
  		<artifactId>json</artifactId>
  		<version>20100903</version>
  		<type>jar</type>
  		<scope>compile</scope>
  	</dependency>
  </dependencies>
        首先準備測試用的資料:
{"id":12,"title":"時尚碼頭美容美髮熱燙特價","longitude":116.3838183,"latitude":39.9629015}
{"id":17,"title":"審美個人美容美髮套餐","longitude":116.386564,"latitude":39.966102}
{"id":23,"title":"海底撈吃300送300","longitude":116.38629,"latitude":39.9629573}
{"id":26,"title":"僅98元!享原價335元李老爹","longitude":116.3846175,"latitude":39.9629125}
{"id":29,"title":"都美造型燙染美髮護理套餐","longitude":116.38629,"latitude":39.9629573}
{"id":30,"title":"僅售55元!原價80元的老舍茶館相聲下午場","longitude":116.0799914,"latitude":39.9655391}
{"id":33,"title":"僅售55元!原價80元的新笑聲客棧早場","longitude":116.0799914,"latitude":39.9655391}
{"id":34,"title":"僅售39元(紅色禮盒)!原價80元的平谷桃","longitude":116.0799914,"latitude":39.9655391}
{"id":46,"title":"僅售38元!原價180元地質禮堂白雪公主","longitude":116.0799914,"latitude":39.9655391}
{"id":49,"title":"僅99元!享原價342.7元自助餐","longitude":116.0799914,"latitude":39.9655391}
{"id":58,"title":"桑海教育暑期學生報名培訓九折優惠券","longitude":116.0799914,"latitude":39.9655391}
{"id":59,"title":"全國發貨:僅29元!貝玲妃超模粉紅高光光","longitude":116.0799914,"latitude":39.9655391}
{"id":65,"title":"海之嶼生態水族用品店抵用券","longitude":116.0799914,"latitude":39.9655391}
{"id":67,"title":"小區東門時尚燙染個人護理美髮套餐","longitude":116.3799914,"latitude":39.9655391}
{"id":74,"title":"《郭德綱相聲專輯》CD套裝","longitude":116.0799914,"latitude":39.9655391}

     根據上面的測試資料,編寫測試用例,分別搜尋座標(116.3838183, 39.9629015)3千米以內的“美髮”和全部內容,分別得到的結果應該是4條和6條。
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;

import java.util.List;

import org.junit.Test;

/**
 * Tests for {@link LuceneSpatial}: searches within 3 km of the point
 * (longitude 116.3838183, latitude 39.9629015), once with a keyword and once
 * without.
 *
 * <p>The test methods declare {@code throws Exception} instead of catching it.
 * The previous {@code catch} called {@code fail(...)} first, which throws and
 * made the following {@code printStackTrace()} unreachable, so the real cause
 * of a failure was never reported.
 */
public class LuceneSpatialTest {

	private static LuceneSpatial spatialSearcher = new LuceneSpatial();

	/**
	 * Keyword search: four results are expected for the keyword within 3 km.
	 *
	 * @throws Exception if the search fails; JUnit reports it with its stack trace
	 */
	@Test
	public void testSearch() throws Exception {
		long start = System.currentTimeMillis();
		List<String> results = spatialSearcher.search("美髮", 116.3838183, 39.9629015, 3.0);
		System.out.println(results.size()
				+ "個匹配結果,共耗時 "
				+ (System.currentTimeMillis() - start) + "毫秒。\n");
		assertEquals(4, results.size());
	}

	/**
	 * Search without a keyword: all six records within 3 km are expected.
	 *
	 * @throws Exception if the search fails; JUnit reports it with its stack trace
	 */
	@Test
	public void testSearchWithoutKeyword() throws Exception {
		long start = System.currentTimeMillis();
		List<String> results = spatialSearcher.search(null, 116.3838183, 39.9629015, 3.0);
		System.out.println( results.size()
				+ "個匹配結果,共耗時 "
				+ (System.currentTimeMillis() - start) + "毫秒.\n");
		assertEquals(6, results.size());
	}
}

         下面是LuceneSpatial類,在建構函式中初始化變數和建立索引:
public class LuceneSpatial {

	// Lucene objects; all of them are assigned in init().
	private Analyzer analyzer;
	private IndexWriter writer;
	private FSDirectory indexDirectory;
	private IndexSearcher indexSearcher;
	private IndexReader indexReader;
	// File-system directory that holds the index.
	private String indexPath = "c:/lucene-spatial";

	// Spatial
	private IProjector projector;
	private CartesianTierPlotter ctp;
	public static final double RATE_MILE_TO_KM = 1.609344; // ratio between miles and kilometers (kilometers per mile)
	// Names of the document fields that hold the encoded latitude and longitude.
	public static final String LAT_FIELD = "lat";
	public static final String LON_FIELD = "lng";
	private static final double MAX_RANGE = 15.0; // largest range the index supports, in kilometers
	private static final double MIN_RANGE = 3.0;  // smallest range the index supports, in kilometers
	// First and last Cartesian tier that get indexed; computed in initializeSpatialOptions().
	private int startTier;
	private int endTier;

	/**
	 * Creates the searcher and runs {@code init()}.
	 *
	 * @throws IllegalStateException if initialization fails; the original
	 *         exception is attached as the cause
	 */
	public LuceneSpatial() {
		try {
			init();
		} catch (Exception e) {
			// Fail fast: swallowing the error here would hand out an instance
			// whose searcher was never assigned, and the first search() call
			// would then die with an unrelated NullPointerException.
			throw new IllegalStateException(
					"Failed to initialize LuceneSpatial with index path " + indexPath, e);
		}
	}

	/**
	 * Initializes the spatial settings and the analyzer, builds the index when
	 * the index directory does not exist yet, and opens a read-only reader and
	 * a searcher on it.
	 *
	 * @throws Exception if the index path exists but is not a directory, the
	 *         directory cannot be created, or Lucene fails to open the index
	 */
	private void init() throws Exception {
		initializeSpatialOptions();

		analyzer = new StandardAnalyzer(Version.LUCENE_32);

		File path = new File(indexPath);

		if (path.exists() && !path.isDirectory()) {
			throw new Exception("Specified path is not a directory");
		}

		// The index is only built when its directory is created by this call.
		boolean isNeedCreateIndex = false;
		if (!path.exists()) {
			if (!path.mkdirs()) {
				throw new IOException("Cannot create index directory: " + path);
			}
			isNeedCreateIndex = true;
		}

		indexDirectory = FSDirectory.open(path);

		// Build the index.
		if (isNeedCreateIndex) {
			IndexWriterConfig indexWriterConfig = new IndexWriterConfig(
					Version.LUCENE_32, analyzer);
			indexWriterConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
			writer = new IndexWriter(indexDirectory, indexWriterConfig);
			try {
				buildIndex();
			} finally {
				// Nothing is indexed after this point, so close the writer
				// rather than leaving it (and its lock on the directory) open
				// for the lifetime of the object.
				writer.close();
			}
		}

		indexReader = IndexReader.open(indexDirectory, true);
		indexSearcher = new IndexSearcher(indexReader);

	}

	/**
	 * Creates the projector and a tier plotter, then derives the tier range to
	 * index: {@code startTier} is the best fit for {@code MAX_RANGE} and
	 * {@code endTier} the best fit for {@code MIN_RANGE}, both converted from
	 * kilometers to miles first.
	 */
	@SuppressWarnings("deprecation")
	private void initializeSpatialOptions() {
		projector = new SinusoidalProjector();
		ctp = new CartesianTierPlotter(0, projector,
				CartesianTierPlotter.DEFALT_FIELD_PREFIX);

		final double maxRangeMiles = MAX_RANGE / RATE_MILE_TO_KM;
		final double minRangeMiles = MIN_RANGE / RATE_MILE_TO_KM;

		startTier = ctp.bestFit(maxRangeMiles);
		endTier = ctp.bestFit(minRangeMiles);
	}



	/**
	 * Converts a distance in miles to meters.
	 *
	 * @param miles distance in miles
	 * @return the distance in meters, with the fractional part dropped by the int cast
	 */
	private int mile2Meter(double miles) {
		return (int) (miles * RATE_MILE_TO_KM * 1000);
	}

	/**
	 * Converts a distance in kilometers to miles.
	 *
	 * @param km distance in kilometers
	 * @return the same distance in miles
	 */
	private double km2Mile(double km) {
		final double miles = km / RATE_MILE_TO_KM;
		return miles;
	}

              建立索引的具體實現:

	/**
	 * Reads the class-path resource "data" (resolved relative to
	 * {@code LuceneSpatial.class}) line by line, indexes every non-blank line
	 * as one JSON record, and commits the writer.
	 *
	 * <p>Failures are printed and not rethrown, as before. The reader is always
	 * closed.
	 */
	private void buildIndex() {
		BufferedReader br = null;
		try {
			// Add the test data to the index line by line; the data file sits
			// in the same directory as the source file.
			java.io.InputStream in = LuceneSpatial.class
					.getResourceAsStream("data");
			if (in == null) {
				// Report the missing resource explicitly rather than through
				// a NullPointerException from the reader.
				throw new IOException(
						"Resource 'data' not found next to LuceneSpatial.class");
			}
			// Decode explicitly instead of with the platform default charset,
			// which would garble the Chinese titles on some machines.
			// NOTE(review): assumes the data file is saved as UTF-8 - confirm.
			br = new BufferedReader(new InputStreamReader(in, "UTF-8"));
			String line = null;
			while ((line = br.readLine()) != null) {
				// A blank line is not a JSON record and would otherwise abort
				// the whole load with a parse error.
				if (line.trim().length() == 0) {
					continue;
				}
				index(new JSONObject(line));
			}

			writer.commit();
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			if (br != null) {
				try {
					br.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}

	/**
	 * Turns one JSON record into a Lucene document with "id", "title" and the
	 * location fields, and adds it through the writer.
	 *
	 * @param jo the record; must contain "id", "title", "longitude" and "latitude"
	 * @throws Exception if a key is missing or the document cannot be added
	 */
	private void index(JSONObject jo) throws Exception {
		Document doc = new Document();

		// The test data stores id as a JSON number. Reading it with get() and
		// converting it here does not depend on getString() accepting
		// non-string values, which stricter org.json releases refuse.
		doc.add(new Field("id", String.valueOf(jo.get("id")), Field.Store.YES,
				Field.Index.ANALYZED));

		doc.add(new Field("title", jo.getString("title"), Field.Store.YES,
				Field.Index.ANALYZED));

		// Add the location information to the document.
		indexLocation(doc, jo);

		writer.addDocument(doc);
	}

	/**
	 * Adds the location of one record to the given document: the prefix-coded
	 * latitude and longitude under {@code LAT_FIELD} / {@code LON_FIELD}, plus
	 * one Cartesian tier box id field for every tier from {@code startTier} to
	 * {@code endTier} inclusive.
	 *
	 * @param document the document that receives the location fields
	 * @param jo       the record; its "longitude" and "latitude" entries are read as doubles
	 * @throws Exception if a coordinate cannot be read from {@code jo}
	 */
	private void indexLocation(Document document, JSONObject jo)
			throws Exception {

		double longitude = jo.getDouble("longitude");
		double latitude = jo.getDouble("latitude");

		// Use the shared field-name constants so the names stay in one place.
		document.add(new Field(LAT_FIELD, NumericUtils
				.doubleToPrefixCoded(latitude), Field.Store.YES,
				Field.Index.NOT_ANALYZED));
		document.add(new Field(LON_FIELD, NumericUtils
				.doubleToPrefixCoded(longitude), Field.Store.YES,
				Field.Index.NOT_ANALYZED));

		for (int tier = startTier; tier <= endTier; tier++) {
			// A local plotter per tier; the shared ctp field is no longer
			// overwritten as a side effect of indexing a document.
			final CartesianTierPlotter tierPlotter = new CartesianTierPlotter(
					tier, projector, CartesianTierPlotter.DEFALT_FIELD_PREFIX);
			final double boxId = tierPlotter.getTierBoxId(latitude, longitude);
			document.add(new Field(tierPlotter.getTierFieldName(), NumericUtils
					.doubleToPrefixCoded(boxId), Field.Store.YES,
					Field.Index.NOT_ANALYZED_NO_NORMS));
		}
	}

          搜尋的具體實現:
	/**
	 * Searches for documents within {@code range} kilometers of the given
	 * point, sorted by the distance comparator, and returns one formatted line
	 * per hit. Each line is also printed to standard output.
	 *
	 * @param keyword   text matched against the title; {@code null} or empty
	 *                  returns everything in range
	 * @param longitude longitude of the search center
	 * @param latitude  latitude of the search center
	 * @param range     search radius in kilometers
	 * @return one line per hit with the title and the distance in meters
	 * @throws Exception if the query cannot be built or the search fails
	 */
	public List<String> search(String keyword, double longitude,
			double latitude, double range) throws Exception {
		List<String> result = new ArrayList<String>();

		double miles = km2Mile(range);

		// Same field names as used at index time, taken from the shared constants.
		DistanceQueryBuilder dq = new DistanceQueryBuilder(latitude,
				longitude, miles, LAT_FIELD, LON_FIELD,
				CartesianTierPlotter.DEFALT_FIELD_PREFIX, true, startTier,
				endTier);

		// Sort by distance.
		DistanceFieldComparatorSource dsort = new DistanceFieldComparatorSource(
				dq.getDistanceFilter());
		Sort sort = new Sort(new SortField("geo_distance", dsort));

		Query query = buildQuery(keyword);

		// Run the search.
		TopDocs hits = indexSearcher.search(query, dq.getFilter(),
				Integer.MAX_VALUE, sort);
		// Distance of each hit, keyed by document id.
		Map<Integer, Double> distances = dq.getDistanceFilter()
				.getDistances();

		// Bound the loop by the array that is actually indexed; totalHits is
		// the total match count and is not guaranteed to equal its length.
		for (int i = 0; i < hits.scoreDocs.length; i++) {
			final int docID = hits.scoreDocs[i].doc;

			final Document doc = indexSearcher.doc(docID);

			final StringBuilder builder = new StringBuilder();
			builder.append("找到了: ")
					.append(doc.get("title"))
					.append(", 距離: ")
					.append(mile2Meter(distances.get(docID)))
					.append("米。");
			System.out.println(builder.toString());

			result.add(builder.toString());
		}

		return result;
	}

	/**
	 * Builds the text query for a search.
	 *
	 * @param keyword the user's keyword; may be {@code null}
	 * @return a match-all query when {@code keyword} is {@code null}, empty or
	 *         only whitespace; otherwise the keyword parsed against the "title"
	 *         field with AND as the default operator
	 * @throws Exception if the keyword cannot be parsed
	 */
	private Query buildQuery(String keyword) throws Exception {
		// With no keyword, return everything in range. A whitespace-only
		// keyword is treated the same way instead of being handed to the
		// parser, which is not expected to accept a blank query.
		if (keyword == null || keyword.trim().isEmpty()) {
			return new MatchAllDocsQuery();
		}
		QueryParser parser = new QueryParser(Version.LUCENE_32, "title",
				analyzer);

		parser.setDefaultOperator(Operator.AND);

		return parser.parse(keyword);
	}
       

             執行測試用例,可以得到下面的結果:

找到了: 時尚碼頭美容美髮熱燙特價, 距離: 0米。
找到了: 都美造型燙染美髮護理套餐, 距離: 210米。
找到了: 審美個人美容美髮套餐, 距離: 426米。
找到了: 小區東門時尚燙染個人護理美髮套餐, 距離: 439米。
4個匹配結果,共耗時 119毫秒。

找到了: 時尚碼頭美容美髮熱燙特價, 距離: 0米。
找到了: 僅98元!享原價335元李老爹, 距離: 68米。
找到了: 海底撈吃300送300, 距離: 210米。
找到了: 都美造型燙染美髮護理套餐, 距離: 210米。
找到了: 審美個人美容美髮套餐, 距離: 426米。
找到了: 小區東門時尚燙染個人護理美髮套餐, 距離: 439米。
6個匹配結果,共耗時 3毫秒.

            參考文獻:

            兩篇示例(其中大部分程式碼就來自於這裡):