1. 程式人生 > 詞根統計系統 實現背單詞計劃

詞根統計系統 實現背單詞計劃

人生啊,總是在不斷變化,往往會在出其不意的地方出現Bug,對此我們需要萬分小心

在詞根統計功能的基礎上結合爬蟲,從 https://www.etymonline.com/ 網站爬取每個單詞的相關解釋。查詢介面為:

https://www.etymonline.com/search?q=

利用爬蟲進行解析

/**
 * Builds a day-by-day vocabulary plan from a word list.
 *
 * <p>For a base name {@code name} the class works with these files:
 * <ul>
 *   <li>{@code name + ".txt"} - the space-separated word list;</li>
 *   <li>{@code name + "_fanyi.txt"} - one "word translation" pair per line;</li>
 *   <li>{@code name + "_jihua_jilu.txt"} - the index of the next day to generate;</li>
 *   <li>{@code name + "\\jihua_" + day + ".txt"} - the generated plan for one day.</li>
 * </ul>
 *
 * <p>All files are read and written with the platform default charset.
 */
public class SkillOfWords {

    /** Number of words written into one daily plan file. */
    private static final int WORDS_PER_DAY = 10;

    /** Word to translation map, filled once from the "_fanyi.txt" file. */
    private static Map<String, String> wordfanyicache = new HashMap<String, String>();

    /**
     * Loads the translation file into {@link #wordfanyicache} if the cache is still empty.
     *
     * <p>Each line is split on single spaces. The first token, minus its last character, is the
     * word; the second token is the translation. Lines that do not consist of exactly two tokens,
     * or whose word is empty after dropping the last character, are ignored.
     *
     * @param name base name; the file read is {@code name + "_fanyi.txt"}
     * @throws IOException if the file cannot be read
     */
    private static void getwordfanyicache(String name) throws IOException {
        if (wordfanyicache.size() != 0) {
            return;
        }
        File file = new File(name + "_fanyi.txt");
        if (!file.exists()) {
            System.out.println("翻譯檔案不存在");
            return;
        }
        try (BufferedReader reader =
                new BufferedReader(new InputStreamReader(new FileInputStream(file)))) {
            String line;
            int cnt = 0;
            while ((line = reader.readLine()) != null) {
                String[] tmp = line.split(" ");
                // A line made only of spaces splits into an empty array.
                if (tmp.length == 0 || tmp[0].length() == 0) {
                    continue;
                }
                // NOTE(review): the last character of the first token is dropped; presumably a
                // trailing separator in the translation file - confirm against the file format.
                String word = tmp[0].substring(0, tmp[0].length() - 1);
                if (cnt == 0) {
                    System.out.println(word);
                }
                String value = tmp.length == 2 ? tmp[1] : "";
                // Compare by content: "==" on strings only compares references.
                if (word.isEmpty() || value.isEmpty()) {
                    continue;
                }
                wordfanyicache.put(word, value);
                cnt++;
            }
        }
        System.out.println("終於讀完了");
    }

    /**
     * Generates the daily plan files for the word list {@code name + ".txt"}.
     *
     * <p>Words belonging to days that were already generated (according to the progress file) are
     * skipped. For every remaining word the detail page is fetched and its extracted text is
     * appended to the current day's plan. After every {@value #WORDS_PER_DAY} words the plan is
     * written to disk and the progress file is updated. A trailing group of fewer than
     * {@value #WORDS_PER_DAY} words is not written.
     *
     * @param name base name of the word list (without the ".txt" extension)
     * @throws IOException if any of the files cannot be read or written
     */
    public static void getSkill(String name) throws IOException {
        if (wordfanyicache.size() == 0) {
            getwordfanyicache(name);
        }
        File file = new File(name + ".txt");
        if (!file.exists()) {
            System.out.println("檔案不存在");
            return;
        }

        // Lines are joined without a separator, exactly as before.
        // NOTE(review): a word ending one line and a word starting the next are merged unless
        // the line ends with a space - confirm the word-list format.
        StringBuilder text = new StringBuilder();
        try (BufferedReader reader =
                new BufferedReader(new InputStreamReader(new FileInputStream(file)))) {
            String line;
            while ((line = reader.readLine()) != null) {
                text.append(line);
            }
        }
        String[] words = text.toString().split(" ");

        int cntjihua = getJihuaTian(name);
        // Words of the days finished in earlier runs. Using "<=" below skips all of them; the
        // previous "<" re-processed the last word of the last finished day, so a resumed run
        // produced plans shifted by one word compared with a fresh run.
        final int alreadyPlanned = WORDS_PER_DAY * cntjihua;
        int totalwords = 0;
        int cntword = 0;
        StringBuilder jihua = new StringBuilder();

        for (String word : words) {
            if (word.length() == 0) {
                continue;
            }
            totalwords++;
            if (totalwords <= alreadyPlanned) {
                continue;
            }
            cntword++;

            String url = Link.WORD_DETAIL_BASE.getLink() + word;
            StringBuilder ans = new StringBuilder();
            ans.append(cntword).append("、").append(word)
                    .append(": ").append(url)
                    .append("意思是: ").append(wordfanyicache.get(word)).append("\n");

            HtmlPage page = Craw.getInstance().parsePage(new WebEntity(url));
            if (page != null) {
                // TODO (2018/12/11): parse the page and store it; every 10 words go into one
                // file, producing the task for each day.
                ans.append(LabelUtil.analyzeHTMLByString(page.asXml(), YeMian.WORD_DETAIL));
            }
            jihua.append("\n").append(ans);

            if (cntword == WORDS_PER_DAY) {
                writeText(new File(name + "\\jihua_" + cntjihua + ".txt"), jihua.toString());
                System.out.println("第" + cntjihua + "天,生成完成");
                saveJihuaTian(cntjihua + 1, name);
                jihua.setLength(0);
                cntjihua++;
                cntword = 0;
            }
        }
    }

    /**
     * Stores the index of the next day to generate in {@code name + "_jihua_jilu.txt"}.
     *
     * @param jihua index of the next day
     * @param name  base name of the word list
     * @throws IOException if the progress file cannot be written
     */
    private static void saveJihuaTian(int jihua, String name) throws IOException {
        writeText(new File(name + "_jihua_jilu.txt"), String.valueOf(jihua));
    }

    /**
     * Reads the index of the next day to generate from {@code name + "_jihua_jilu.txt"}.
     *
     * @param name base name of the word list
     * @return the stored index, or 0 when the file is missing or its first line is blank
     * @throws IOException if the progress file cannot be read
     * @throws NumberFormatException if the first line is not an integer
     */
    private static int getJihuaTian(String name) throws IOException {
        File file = new File(name + "_jihua_jilu.txt");
        if (!file.exists()) {
            return 0;
        }
        try (BufferedReader reader =
                new BufferedReader(new InputStreamReader(new FileInputStream(file)))) {
            String line = reader.readLine();
            if (line == null || line.trim().isEmpty()) {
                return 0;
            }
            return Integer.parseInt(line.trim());
        }
    }

    /**
     * Replaces the content of {@code file} with {@code text}, creating the file if needed.
     * The stream is closed even when writing fails.
     */
    private static void writeText(File file, String text) throws IOException {
        try (BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(file))) {
            out.write(text.getBytes());
        }
    }
}

解析html

/**
 * Helpers that reduce a fetched HTML page to plain text.
 */
public class LabelUtil {

    /** Decimal (group 1) or hexadecimal (group 2) numeric character reference. */
    private static final Pattern NUMERIC_ENTITY =
            Pattern.compile("&#(?:(\\d+)|[xX]([0-9a-fA-F]+));");

    /** Opening br/div/p/td tags; {@code [^>]} lets the match span line breaks. */
    private static final Pattern BLOCK_TAG =
            Pattern.compile("<\\s*(?i:br|div|p|td)\\s*(?:[^>])*\\s*>");

    /** Any remaining tag that sits on a single line. */
    private static final Pattern ANY_TAG = Pattern.compile("<.*?>");

    /** Escaped angle brackets, with or without the terminating semicolon. */
    private static final Pattern ANGLE_ENTITY = Pattern.compile("&(?:g|l)t;?");

    private static final String SCRIPT_OPEN = "<script";
    private static final String SCRIPT_CLOSE = "</script>";

    /**
     * Extracts the plain text of the word definition from a page.
     *
     * <p>Only {@code YeMian.WORD_DETAIL} is handled: the first {@code section} inside the first
     * element with class {@code word--C9UPa} is converted with {@link #handleHtmlLabel(String)}.
     *
     * @param html   the page markup
     * @param yeMian the kind of page {@code html} represents
     * @return the extracted text, or an empty string when the page kind is not handled or the
     *         expected elements are missing
     */
    public static String analyzeHTMLByString(String html, YeMian yeMian) {
        if (yeMian != YeMian.WORD_DETAIL) {
            return "";
        }
        Document document = Jsoup.parse(html);
        // first() yields null when nothing matches; test for that explicitly instead of
        // catching NullPointerException.
        Element wordBlock = document.select(".word--C9UPa").first();
        Element section = wordBlock == null ? null : wordBlock.select("section").first();
        if (section == null) {
            System.out.println("不存在");
            return "";
        }
        return handleHtmlLabel(section.toString());
    }

    /**
     * Converts an HTML fragment to plain text.
     *
     * <p>Steps, in order: {@code &amp;} becomes {@code &}; numeric character references are
     * decoded; {@code <script>} elements are removed together with their content; tags,
     * {@code &nbsp;} and escaped angle brackets are removed; the result is trimmed.
     *
     * @param html the HTML fragment, must not be {@code null}
     * @return the text content with leading and trailing whitespace removed
     */
    public static String handleHtmlLabel(String html) {
        String decoded = decodeNumericEntities(html.replace("&amp;", "&"));
        String text = stripScripts(decoded);
        text = BLOCK_TAG.matcher(text).replaceAll("");
        text = text.replace("&nbsp;", "");
        text = ANY_TAG.matcher(text).replaceAll("");
        text = ANGLE_ENTITY.matcher(text).replaceAll("");
        return text.trim();
    }

    /**
     * Decodes decimal and hexadecimal numeric character references in a single pass.
     * References that do not denote a valid code point are left untouched.
     */
    private static String decodeNumericEntities(String html) {
        Matcher matcher = NUMERIC_ENTITY.matcher(html);
        StringBuffer out = new StringBuffer(html.length());
        while (matcher.find()) {
            String replacement = matcher.group();
            try {
                int codePoint = matcher.group(1) != null
                        ? Integer.parseInt(matcher.group(1))
                        : Integer.parseInt(matcher.group(2), 16);
                if (Character.isValidCodePoint(codePoint)) {
                    // toChars keeps supplementary characters intact; a plain (char) cast
                    // would truncate them.
                    replacement = new String(Character.toChars(codePoint));
                }
            } catch (NumberFormatException ignored) {
                // The number does not fit into an int, so it cannot be a code point.
            }
            // Quote the replacement so a decoded '$' or '\' is inserted literally.
            matcher.appendReplacement(out, Matcher.quoteReplacement(replacement));
        }
        matcher.appendTail(out);
        return out.toString();
    }

    /**
     * Removes every {@code <script ...>...</script>} element including its content.
     * An unterminated script element removes everything up to the end of the input.
     */
    private static String stripScripts(String html) {
        StringBuilder out = new StringBuilder(html.length());
        int pos = 0;
        while (true) {
            int open = html.indexOf(SCRIPT_OPEN, pos);
            if (open == -1) {
                out.append(html, pos, html.length());
                break;
            }
            out.append(html, pos, open);
            // Look for the closing tag after the opening one, not from the start of the text.
            int close = html.indexOf(SCRIPT_CLOSE, open);
            if (close == -1) {
                break;
            }
            pos = close + SCRIPT_CLOSE.length();
        }
        return out.toString();
    }
}

在這裡插入圖片描述

功能還沒寫完,待更新