詞根統計系統 實現背單詞計劃
阿新 • • 發佈:2019-01-03
人生啊,總是在不斷變化,往往會在出其不意的地方出現Bug,對此我們需要萬分小心
在詞根統計功能的基礎上結合爬蟲,從 https://www.etymonline.com/ 網站爬取單詞的相關解釋。查詢介面為:
https://www.etymonline.com/search?q=
利用爬蟲進行解析
/**
 * Generates a day-by-day vocabulary study plan.
 *
 * <p>For a base name {@code name} the class works with these files:
 * <ul>
 *   <li>{@code name.txt} - the space-separated word list (input);</li>
 *   <li>{@code name_fanyi.txt} - one "word translation" pair per line (input, optional);</li>
 *   <li>{@code name_jihua_jilu.txt} - index of the next day to generate (progress record);</li>
 *   <li>{@code name/jihua_N.txt} - the generated plan for day {@code N} (output).</li>
 * </ul>
 *
 * <p>All files are read and written with the platform default charset, which keeps
 * the class compatible with data files produced by earlier runs.
 */
public class SkillOfWords {

    /** Number of words written into each daily plan file. */
    private static final int WORDS_PER_DAY = 10;

    /** Translations keyed by word; filled lazily by {@link #getwordfanyicache(String)}. */
    private static Map<String,String> wordfanyicache = new HashMap<String, String>();

    /**
     * Loads {@code name_fanyi.txt} into the translation cache if the cache is still empty.
     *
     * <p>Each line is split on single spaces. The last character of the first token is
     * dropped to obtain the word (presumably a trailing delimiter - confirm against the
     * file format), and the second token is the translation. Lines that do not consist of
     * exactly two tokens, or whose word is empty, are skipped.
     *
     * @param name base name of the word list
     * @throws IOException if the translation file cannot be read
     */
    private static void getwordfanyicache(String name) throws IOException {
        if (!wordfanyicache.isEmpty()) {
            return;
        }
        File file = new File(name + "_fanyi.txt");
        if (!file.exists()) {
            System.out.println("翻譯檔案不存在");
            return;
        }
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file)))) {
            int stored = 0;
            String line;
            while ((line = reader.readLine()) != null) {
                String[] tokens = line.split(" ");
                // A line made only of spaces splits into an empty array.
                if (tokens.length == 0 || tokens[0].isEmpty()) {
                    continue;
                }
                String word = tokens[0].substring(0, tokens[0].length() - 1);
                if (stored == 0) {
                    // Progress output: echoes candidate words until the first entry is stored.
                    System.out.println(word);
                }
                String value = tokens.length == 2 ? tokens[1] : "";
                // Content comparison; reference comparison against "" is unreliable.
                if (word.isEmpty() || value.isEmpty()) {
                    continue;
                }
                wordfanyicache.put(word, value);
                stored++;
            }
            System.out.println("終於讀完了");
        }
    }

    /**
     * Generates the plan files for every complete group of {@value #WORDS_PER_DAY} words
     * that has not been generated yet, continuing from the recorded progress.
     *
     * <p>For each word the entry contains its number within the day, the lookup URL, the
     * cached translation and the text extracted from the fetched detail page. After each
     * day is written the progress record is advanced, so an interrupted run can resume.
     *
     * <p>NOTE(review): words left over after the last complete group are fetched but never
     * written to a plan file; this matches the previous behaviour.
     *
     * @param name base name of the word list
     * @throws IOException if any of the files cannot be read or written
     * @throws NumberFormatException if the progress record does not contain an integer
     */
    public static void getSkill(String name) throws IOException {
        if (wordfanyicache.isEmpty()) {
            getwordfanyicache(name);
        }
        File file = new File(name + ".txt");
        if (!file.exists()) {
            System.out.println("檔案不存在");
            return;
        }

        String[] words = readWordList(file);
        int day = getJihuaTian(name);
        int seen = 0;
        int inBatch = 0;
        StringBuilder plan = new StringBuilder();
        for (String word : words) {
            if (word.isEmpty()) {
                continue;
            }
            seen++;
            // Days 0..day-1 are done, i.e. exactly the first WORDS_PER_DAY * day words.
            // The comparison must be inclusive, otherwise the last word of the previous
            // day is emitted again and every following day is shifted by one word.
            if (seen <= WORDS_PER_DAY * day) {
                continue;
            }
            inBatch++;
            plan.append("\n").append(describeWord(inBatch, word));
            if (inBatch == WORDS_PER_DAY) {
                writePlanFile(name, day, plan.toString());
                System.out.println("第"+day+"天,生成完成");
                saveJihuaTian(day + 1, name);
                day++;
                plan.setLength(0);
                inBatch = 0;
            }
        }
    }

    /**
     * Reads the whole word list and splits it on single spaces.
     *
     * <p>NOTE(review): lines are concatenated without a separator, as before, so a word
     * ending one line and a word starting the next are merged unless the line ends with
     * a space - confirm against the word-list format.
     */
    private static String[] readWordList(File file) throws IOException {
        StringBuilder all = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file)))) {
            String line;
            while ((line = reader.readLine()) != null) {
                all.append(line);
            }
        }
        return all.toString().split(" ");
    }

    /** Builds the plan entry for one word: number, URL, cached translation and page text. */
    private static String describeWord(int number, String word) throws IOException {
        String translation = wordfanyicache.get(word);
        String url = Link.WORD_DETAIL_BASE.getLink() + word;
        String entry = number + "、" + word + ": " + url + "意思是: " + translation + "\n";
        WebEntity webEntity = new WebEntity(url);
        Craw craw = Craw.getInstance();
        HtmlPage page = craw.parsePage(webEntity);
        if (page != null) {
            entry = entry + LabelUtil.analyzeHTMLByString(page.asXml(), YeMian.WORD_DETAIL);
        }
        return entry;
    }

    /**
     * Writes one day's plan to {@code name/jihua_<day>.txt}, creating the directory if needed.
     * The path is built with {@link File#File(File, String)} so it works on any platform.
     */
    private static void writePlanFile(String name, int day, String content) throws IOException {
        File dir = new File(name);
        if (!dir.isDirectory() && !dir.mkdirs()) {
            throw new IOException("Cannot create plan directory: " + dir);
        }
        File planFile = new File(dir, "jihua_" + day + ".txt");
        try (BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(planFile))) {
            out.write(content.getBytes());
        }
    }

    /**
     * Records the index of the next day to generate in {@code name_jihua_jilu.txt},
     * replacing any previous content.
     *
     * @param jihua index of the next day to generate
     * @param name  base name of the word list
     * @throws IOException if the record cannot be written
     */
    private static void saveJihuaTian(int jihua,String name) throws IOException {
        File file = new File(name + "_jihua_jilu.txt");
        try (BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(file))) {
            out.write(String.valueOf(jihua).getBytes());
        }
    }

    /**
     * Reads the index of the next day to generate from {@code name_jihua_jilu.txt}.
     *
     * @param name base name of the word list
     * @return the recorded index, or 0 if the record is missing or blank
     * @throws IOException if the record cannot be read
     * @throws NumberFormatException if the first line is not an integer
     */
    private static int getJihuaTian(String name) throws IOException {
        File file = new File(name + "_jihua_jilu.txt");
        if (!file.exists()) {
            return 0;
        }
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file)))) {
            String line = reader.readLine();
            if (line == null || line.trim().isEmpty()) {
                return 0;
            }
            return Integer.parseInt(line.trim());
        }
    }
}
解析 HTML
/**
 * Helpers that turn a fetched HTML page into plain text for the study plan.
 */
public class LabelUtil {

    private static final String SCRIPT_OPEN = "<script";
    private static final String SCRIPT_CLOSE = "</script>";

    /** Decimal ({@code &#65;}) or hexadecimal ({@code &#x41;}) numeric character reference. */
    private static final Pattern NUMERIC_ENTITY =
            Pattern.compile("&#(?:(\\d+)|x([\\da-f]+));", Pattern.CASE_INSENSITIVE);

    private static final Pattern BLOCK_TAG =
            Pattern.compile("<\\s*(?:br|Br|BR|bR|div|DIV|Div|p|P|td|TD|Td)\\s*(?:[^>])*\\s*>");

    private static final Pattern ANY_TAG = Pattern.compile("\\<.*?\\>");

    /** {@code &gt;} / {@code &lt;}; the semicolon is consumed so no stray ';' is left behind. */
    private static final Pattern ANGLE_ENTITY = Pattern.compile("&(?:g|l)t;?");

    /**
     * Extracts the plain text of the word-detail section from a page.
     *
     * <p>Only {@code YeMian.WORD_DETAIL} is handled: the first {@code section} inside the
     * first element matching {@code .word--C9UPa} is converted with
     * {@link #handleHtmlLabel(String)}. If that element is missing, "不存在" is printed.
     *
     * @param html   page markup
     * @param yeMian kind of page being analysed
     * @return the extracted text, or an empty string for other page kinds or when the
     *         section is not found
     */
    public static String analyzeHTMLByString(String html,YeMian yeMian){
        if (yeMian != YeMian.WORD_DETAIL) {
            return "";
        }
        Document document = Jsoup.parse(html);
        // first() yields null on an empty selection; check instead of catching an NPE.
        Element wordBlock = document.select(".word--C9UPa").first();
        Element section = wordBlock == null ? null : wordBlock.select("section").first();
        if (section == null) {
            System.out.println("不存在");
            return "";
        }
        return handleHtmlLabel(section.toString());
    }

    /**
     * Converts an HTML fragment to plain text.
     *
     * <p>Steps, in order: numeric character references are decoded; {@code <script>}
     * elements are removed together with their content; the listed block tags and then
     * any remaining single-line tags are stripped; every space character is removed;
     * {@code &gt;} and {@code &lt;} are removed; the result is trimmed.
     *
     * <p>NOTE(review): removing every space, and the two no-op replacements that preceded
     * it in the earlier version ({@code "&" -> "&"} and {@code "" -> ""}), look like
     * {@code &amp;} / {@code &nbsp;} entities that were lost when the code was published -
     * confirm the intended behaviour. The space removal is kept unchanged here.
     *
     * @param html fragment to convert; {@code null} is treated as empty
     * @return the plain text, never {@code null}
     */
    public static String handleHtmlLabel(String html){
        if (html == null) {
            return "";
        }
        String text = stripScripts(decodeNumericEntities(html));
        text = BLOCK_TAG.matcher(text).replaceAll("");
        text = text.replace(" ", "");
        text = ANY_TAG.matcher(text).replaceAll("");
        text = ANGLE_ENTITY.matcher(text).replaceAll("");
        return text.trim();
    }

    /**
     * Replaces decimal and hexadecimal character references in a single pass.
     * References that do not denote a valid Unicode code point are left untouched.
     */
    private static String decodeNumericEntities(String html) {
        Matcher m = NUMERIC_ENTITY.matcher(html);
        StringBuffer out = new StringBuffer(html.length());
        while (m.find()) {
            String replacement = m.group();
            try {
                int codePoint = m.group(1) != null
                        ? Integer.parseInt(m.group(1))
                        : Integer.parseInt(m.group(2), 16);
                if (Character.isValidCodePoint(codePoint)) {
                    // toChars handles supplementary characters that a (char) cast would truncate.
                    replacement = new String(Character.toChars(codePoint));
                }
            } catch (NumberFormatException ignored) {
                // Number does not fit in an int: not a real character, keep the text as is.
            }
            // Quote so that decoded '$' or '\' are not interpreted as group references.
            m.appendReplacement(out, Matcher.quoteReplacement(replacement));
        }
        m.appendTail(out);
        return out.toString();
    }

    /**
     * Removes every {@code <script ...> ... </script>} span. An unterminated script drops
     * everything from its opening tag onwards. The text is rescanned after each removal;
     * every pass deletes at least the opening tag, so the loop always terminates.
     */
    private static String stripScripts(String html) {
        int open = html.indexOf(SCRIPT_OPEN);
        while (open != -1) {
            // Look for the closing tag after the opening one, not from the start.
            int close = html.indexOf(SCRIPT_CLOSE, open);
            String head = html.substring(0, open);
            html = close == -1 ? head : head + html.substring(close + SCRIPT_CLOSE.length());
            open = html.indexOf(SCRIPT_OPEN);
        }
        return html;
    }
}
功能還沒寫完,待更新