nutch2.3.1 構建solr6索引時meta_keywords longer than the max length 32766
阿新 • • 發佈:2019-01-31
解決辦法有3種:
1是在managed schema中設定meta_* 的indexed=false
2是在managed schema置meta_* 的type=任意一種class是solr.TextField的型別
3是修改nutch程式碼MetaTagsParser.java如下
/**
 * Stores a meta tag value in the page metadata when the tag is selected for
 * indexing.
 *
 * <p>The tag name is lower-cased with {@link Locale#ROOT}. It is stored under
 * {@code PARSE_META_PREFIX + name} if {@code metatagset} contains either the
 * wildcard {@code "*"} or the lower-cased name.
 *
 * <p>Values whose UTF-8 encoding is longer than 32765 bytes are dropped
 * silently, so that they cannot later trigger the indexer's
 * "longer than the max length 32766" failure.
 *
 * @param metadata destination map of metadata key to encoded value
 * @param metatag  meta tag name as found in the document (any case)
 * @param value    meta tag content; must not be {@code null}
 */
private void addIndexedMetatags(Map<CharSequence, ByteBuffer> metadata,
    String metatag, String value) {
  // Encode exactly once, with an explicit charset: the size that is checked is
  // then the size that is stored, and no checked UnsupportedEncodingException
  // can be raised (String.getBytes(String) would require a throws clause).
  final byte[] encoded = value.getBytes(java.nio.charset.StandardCharsets.UTF_8);
  if (encoded.length > 32765) {
    return;
  }
  String lcMetatag = metatag.toLowerCase(Locale.ROOT);
  if (metatagset.contains("*") || metatagset.contains(lcMetatag)) {
    if (LOG.isDebugEnabled()) {
      LOG.debug("Found meta tag: " + lcMetatag + "\t" + value);
    }
    metadata.put(new Utf8(PARSE_META_PREFIX + lcMetatag),
        ByteBuffer.wrap(encoded));
  }
}
另外,如果資料庫中已經存在過長的資料,需要在建立索引時過濾,修改檔案 SolrIndexWriter.java 如下
/**
 * Converts a {@link NutchDocument} into a {@code SolrInputDocument}, queues it,
 * and sends the queued batch to Solr once {@code batchSize} documents have
 * accumulated.
 *
 * <p>Per field value:
 * <ul>
 *   <li>values of {@code meta_*} fields whose UTF-8 encoding exceeds 32765
 *       bytes are skipped with a warning;</li>
 *   <li>values of the {@code content} and {@code title} fields are passed
 *       through {@code SolrUtils.stripNonCharCodepoints};</li>
 *   <li>the value is added under the mapped field name and, when the copy
 *       mapping yields a different name, under that name as well.</li>
 * </ul>
 *
 * @param doc the document to index
 * @throws IOException if a field mapping lookup fails or Solr rejects the batch
 */
@Override
public void write(NutchDocument doc) throws IOException {
  final SolrInputDocument inputDoc = new SolrInputDocument();
  for (final Entry<String, List<String>> e : doc) {
    final String key = e.getKey();
    final boolean isMetaField = key.startsWith("meta_");
    final boolean needsStripping = key.equals("content") || key.equals("title");
    for (final String val : e.getValue()) {
      // Drop oversized meta values before doing any other work on them.
      if (isMetaField
          && val.getBytes(java.nio.charset.StandardCharsets.UTF_8).length > 32765) {
        LOG.warn("trim too long value for key:" + e.getKey());
        continue;
      }
      final Object val2 = needsStripping
          ? SolrUtils.stripNonCharCodepoints(val)
          : val;
      inputDoc.addField(solrMapping.mapKey(key), val2);
      final String sCopy = solrMapping.mapCopyKey(key);
      // Compare by content: an identity check (!=) would treat an equal name
      // held in a different String instance as a distinct copy field.
      if (!key.equals(sCopy)) {
        inputDoc.addField(sCopy, val2);
      }
    }
  }
  inputDoc.setDocumentBoost(doc.getScore());
  inputDocs.add(inputDoc);
  documentCount++;
  if (inputDocs.size() >= batchSize) {
    try {
      LOG.info("Adding " + Integer.toString(inputDocs.size()) + " documents");
      solr.add(inputDocs);
    } catch (final SolrServerException e) {
      throw new IOException(e);
    }
    inputDocs.clear();
  }
}