算法之智能搜索(上)
阿新 • • 發佈:2018-11-25
筆者並不了解各大搜索網站是怎麽實現智能搜索的。以下只是筆者一時的想法,筆者覺得這個方法可以實現智能匹配搜索內容。
一、首先我們獲取細胞詞庫內容
①建表語句:
-- Drops any existing `sougou_ciku` table and recreates it with four varchar columns:
-- `id` and `text` are NOT NULL, `below` and `remark` default to NULL. InnoDB engine, utf8 charset.
-- NOTE(review): no PRIMARY KEY or index is declared on `id`, although the article's entity class
-- labels that field as the primary key -- confirm whether a key/unique constraint is intended.
DROP TABLE IF EXISTS `sougou_ciku`; CREATE TABLE `sougou_ciku` ( `id` varchar(50) NOT NULL, `text` varchar(100) NOT NULL, `below` varchar(50) default NULL, `remark` varchar(100) default NULL ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
②創建映射實體類:
package com.css.java.learning.model; public class SouGouCiKu { private String id;//主鍵 private String text; //內容 private String below;//所屬 private String remark;//備註 public String getId() { return id; } public void setId(String id) { this.id = id; } public String getText() { return text; } public void setText(String text) { this.text = text; } public String getBelow() { return below; } public void setBelow(String below) { this.below = below; } public String getRemark() { return remark; } public void setRemark(String remark) { this.remark = remark; }
}
③創建搜狗scel文件閱讀器:
package com.css.java.learning.massbag; import java.util.List; import java.util.Map; public class SougouScelMdel { private Map<String, List<String>> wordMap; private String name; private String type; private String description; private String sample; public Map<String, List<String>> getWordMap() { return wordMap; } void setWordMap(Map<String, List<String>> wordMap) { this.wordMap = wordMap; } public String getType() { return type; } public void setType(String type) { this.type = type; } public String getDescription() { return description; } public void setDescription(String description) { this.description = description; } public String getSample() { return sample; } public void setSample(String sample) { this.sample = sample; } public String getName() { return name; } public void setName(String name) { this.name = name; }
}
package com.css.java.learning.massbag;
import java.io.*;
import java.net.URL;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
/**
 * Reader for Sogou {@code .scel} dictionary files.
 *
 * <p>The parser works with fixed byte offsets: four UTF-16LE, double-zero
 * terminated strings are read at 0x130 (name), 0x338 (type), 0x540
 * (description) and 0xd40 (sample); a pinyin table starts at 0x1540; the word
 * table starts at 0x2628 or 0x26C4 depending on the fifth byte of the file
 * (0x44 or 0x45).
 *
 * <p>Header checks are written as {@code assert}s, so they only take effect
 * when the JVM runs with assertions enabled.
 *
 * <p>An instance reuses one internal scratch buffer across calls.
 */
public class SougouScelReader {

    /** Charset name used to decode every string read from the file. */
    protected static String encoding = "UTF-16LE";

    /** Scratch buffer reused by {@link #readString} to collect raw string bytes. */
    protected ByteArrayOutputStream output = new ByteArrayOutputStream();

    /**
     * Parses the given file.
     *
     * @param file the {@code .scel} file to read
     * @return the parsed model
     * @throws IOException if the file cannot be opened or read, or ends prematurely
     */
    public SougouScelMdel read(File file) throws IOException {
        return read(new FileInputStream(file));
    }

    /**
     * Parses the resource behind the given URL.
     *
     * @param url location of the {@code .scel} data
     * @return the parsed model
     * @throws IOException if the stream cannot be opened or read, or ends prematurely
     */
    public SougouScelMdel read(URL url) throws IOException {
        return read(url.openStream());
    }

    /**
     * Moves the stream forward to absolute offset {@code pos} and reads a
     * UTF-16LE string terminated by two zero bytes.
     *
     * @param input stream positioned at offset {@code reads[0]}
     * @param pos   absolute offset at which the string starts
     * @param reads one-element array holding the current offset; updated to the
     *              offset just past the terminator
     * @return the decoded string, without the terminator
     * @throws EOFException if the stream ends before the terminator is found
     * @throws IOException  on any other read failure
     */
    protected String readString(DataInputStream input, int pos, int[] reads) throws IOException {
        skipFully(input, pos - reads[0]);
        int read = pos;
        output.reset();
        while (true) {
            int c1 = input.read();
            int c2 = input.read();
            if ((c1 | c2) < 0) {
                // Without this check a truncated file would append -1 bytes forever.
                throw new EOFException(
                        "Unterminated string starting at offset 0x" + Integer.toHexString(pos));
            }
            read += 2;
            if (c1 == 0 && c2 == 0) {
                break;
            }
            output.write(c1);
            output.write(c2);
        }
        reads[0] = read;
        return new String(output.toByteArray(), encoding);
    }

    /**
     * Parses {@code .scel} data from the given stream and closes the stream
     * afterwards, whether parsing succeeds or fails.
     *
     * @param in stream positioned at the first byte of the data
     * @return the parsed model; its word map is keyed by the pinyin string of
     *         each word group, in file order
     * @throws EOFException if the data ends in the middle of a structure
     * @throws IOException  on any other read failure
     */
    public SougouScelMdel read(InputStream in) throws IOException {
        SougouScelMdel model = new SougouScelMdel();
        DataInputStream input = new DataInputStream(in);
        int read;
        try {
            byte[] bytes = new byte[4];
            input.readFully(bytes);
            assert (bytes[0] == 0x40 && bytes[1] == 0x15 && bytes[2] == 0 && bytes[3] == 0);
            input.readFully(bytes);
            // Selects the word-table offset further down.
            int flag1 = bytes[0];
            assert (bytes[1] == 0x43 && bytes[2] == 0x53 && bytes[3] == 0x01);

            // 8 header bytes have been consumed so far.
            int[] reads = new int[]{8};
            model.setName(readString(input, 0x130, reads));
            model.setType(readString(input, 0x338, reads));
            model.setDescription(readString(input, 0x540, reads));
            model.setSample(readString(input, 0xd40, reads));
            read = reads[0];

            skipFully(input, 0x1540 - read);
            read = 0x1540;
            input.readFully(bytes);
            read += 4;
            assert (bytes[0] == (byte) 0x9D && bytes[1] == 0x01 && bytes[2] == 0 && bytes[3] == 0);

            bytes = new byte[128];
            Map<Integer, String> pyMap = readPinyinTable(input, bytes);
            read += pinyinTableLength(pyMap);

            if (flag1 == 0x44) {
                skipFully(input, 0x2628 - read);
            } else if (flag1 == 0x45) {
                skipFully(input, 0x26C4 - read);
            }
            // Any other flag value: continue reading from the current position.

            model.setWordMap(readWordTable(input, pyMap, bytes));
            return model;
        } finally {
            in.close();
        }
    }

    /**
     * Reads pinyin entries (2-byte index, 1-byte length, 1 skipped byte, text)
     * until the entry whose text is {@code "zuo"} has been read.
     */
    private Map<Integer, String> readPinyinTable(DataInputStream input, byte[] scratch)
            throws IOException {
        byte[] bytes = scratch;
        Map<Integer, String> pyMap = new LinkedHashMap<Integer, String>();
        while (true) {
            int mark = readUnsignedShort(input);
            int size = input.readUnsignedByte();
            skipFully(input, 1);
            assert (size > 0 && (size % 2) == 0);
            if (size > bytes.length) {
                // The length byte can be up to 255, more than the initial scratch size.
                bytes = new byte[size];
            }
            input.readFully(bytes, 0, size);
            String py = new String(bytes, 0, size, encoding);
            pyMap.put(mark, py);
            if ("zuo".equals(py)) {
                return pyMap;
            }
        }
    }

    /**
     * Number of bytes the entries of {@code pyMap} occupied in the stream:
     * 4 header bytes plus 2 bytes per character for each entry.
     *
     * <p>NOTE(review): this assumes every entry had a distinct index; a
     * duplicated index would make the count smaller than the bytes consumed
     * — confirm that indexes are unique in real files.
     */
    private static int pinyinTableLength(Map<Integer, String> pyMap) {
        int total = 0;
        for (String py : pyMap.values()) {
            total += 4 + py.length() * 2;
        }
        return total;
    }

    /**
     * Reads word groups until end of stream. Each group is: word count,
     * byte length of the pinyin index list, the indexes, then for every word
     * its byte length, its text and 12 bytes that are skipped.
     */
    private Map<String, List<String>> readWordTable(
            DataInputStream input, Map<Integer, String> pyMap, byte[] scratch) throws IOException {
        byte[] bytes = scratch;
        StringBuilder buffer = new StringBuilder();
        Map<String, List<String>> wordMap = new LinkedHashMap<String, List<String>>();
        while (true) {
            int size = readUnsignedShort(input);
            if (size < 0) {
                // End of stream at a group boundary: the table is complete.
                break;
            }
            int count = readUnsignedShort(input);
            if (count < 0) {
                throw new EOFException("Stream ended inside a word group header");
            }
            int len = count / 2;
            assert (len * 2 == count);
            buffer.setLength(0);
            for (int i = 0; i < len; i++) {
                int key = readUnsignedShort(input);
                // NOTE(review): the separator is the typographic quote U+2018 exactly as found
                // in the source; it may have been meant as an ASCII apostrophe. It is part of
                // the map keys, so confirm with callers before changing it.
                buffer.append(pyMap.get(key)).append("‘");
            }
            if (buffer.length() > 0) {
                // Drop the trailing separator; a group without indexes yields an empty key.
                buffer.setLength(buffer.length() - 1);
            }
            String py = buffer.toString();
            List<String> list = wordMap.get(py);
            if (list == null) {
                list = new ArrayList<String>();
                wordMap.put(py, list);
            }
            for (int i = 0; i < size; i++) {
                count = readUnsignedShort(input);
                if (count < 0) {
                    throw new EOFException("Stream ended inside a word entry");
                }
                if (count > bytes.length) {
                    bytes = new byte[count];
                }
                input.readFully(bytes, 0, count);
                String word = new String(bytes, 0, count, encoding);
                skipFully(input, 12);
                list.add(word);
            }
        }
        return wordMap;
    }

    /**
     * Skips exactly {@code n} bytes. {@link InputStream#skip(long)} is allowed
     * to skip fewer bytes than requested, so a single call cannot be relied on
     * to reach a fixed offset.
     *
     * <p>Zero and negative counts are handed to the stream's own {@code skip}
     * once, unchanged, so stream-specific handling of such counts is kept.
     *
     * @throws EOFException if the stream ends before {@code n} bytes were skipped
     */
    private static void skipFully(InputStream in, long n) throws IOException {
        if (n <= 0) {
            in.skip(n);
            return;
        }
        long remaining = n;
        while (remaining > 0) {
            long skipped = in.skip(remaining);
            if (skipped > 0) {
                remaining -= skipped;
            } else if (in.read() < 0) {
                // skip() made no progress and a real read confirms end of stream.
                throw new EOFException(
                        "Stream ended with " + remaining + " of " + n + " bytes left to skip");
            } else {
                remaining--;
            }
        }
    }

    /**
     * Reads a little-endian unsigned 16-bit value.
     *
     * @param in stream to read two bytes from
     * @return the value in the range 0..65535, or {@link Integer#MIN_VALUE} if
     *         the stream ended before both bytes were available
     * @throws IOException if reading fails
     */
    protected final int readUnsignedShort(InputStream in) throws IOException {
        int ch1 = in.read();
        int ch2 = in.read();
        if ((ch1 | ch2) < 0) {
            return Integer.MIN_VALUE;
        }
        return (ch2 << 8) + (ch1 << 0);
    }
}
④從搜狗官網下載細胞詞庫 .scel 文件
略!
⑤讀取細胞詞庫文件.scel插入數據庫
/**
 * Reads the .scel file at {@code path}, walks every word in its word map and,
 * for each word, calls {@code jugeWord}; when that returns {@code true} the
 * word is passed to {@code insert}. Every word is also printed to standard output.
 *
 * @param path location of the .scel file
 * @throws IOException if the file cannot be read
 */
private static void sogou(String path) throws IOException {
    SougouScelMdel model = new SougouScelReader().read(new File(path));
    // Word map: pinyin -> words.
    Map<String, List<String>> words = model.getWordMap();
    for (Entry<String, List<String>> entry : words.entrySet()) {
        for (String word : entry.getValue()) {
            /*
             * Check whether the word already appears in the database: add it if
             * absent, do nothing if present. (The check method is not shown here.)
             * NOTE(review): insertion runs when jugeWord returns true; whether
             * true means "absent" cannot be seen from here -- confirm against
             * jugeWord, since the original flag name suggested "exists".
             */
            boolean verdict = jugeWord(word);
            if (verdict) {
                /*
                 * Store the word in the database for later use.
                 * (This method is not shown here either.)
                 */
                insert(word);
            }
            System.out.println(word);
        }
    }
}
⑥執行搜狗細胞詞庫插入數據庫
筆者以下面的文件為例:
執行後得到如下數據:
下篇講解,筆者自創的簡單算法,拆分輸入語句匹配詞庫完成搜索過程。
算法之智能搜索(上)