Java爬蟲高階版(今日頭條)

阿新 • • 發佈：2019-02-18

宣告：浙大java課程小作業
作者：GeSq

- 功能描述
- UI介面
- 結果
- 邏輯
- 程式碼

功能描述

爬取今日頭條文章的圖片和正文文字。僅適用於頭條文章版網頁，不支援相簿版網頁。

UI介面

這裡寫圖片描述

匯出目錄：自己填寫匯出目錄。如果不填，預設是當前目錄。

點選按鈕進行爬取。

結果

這裡寫圖片描述

邏輯：

輸入url，爬取對應url裡的HTML檔案，從中篩選出image的url放入List中，然後逐個下載到本地資料夾。爬取對應文字，然後寫入本地文件。

程式碼

Main.java

import java.io.ByteArrayOutputStream; 

import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
public class Main{
    public static void main(String[] args) {
        SwingDemo demo = new SwingDemo();
    }
}

SwingDemo.java

import sun.java2d.loops.FillPath;

import 
 javax.swing.*;
import java.awt.*;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.io.File;

/**
 * Created by geshuaiqi on 2017/11/9.
 */
public class SwingDemo extends JFrame
{
    private String mHttpurl; // 網路url
    private String mLocalurl;// 本地url
    private String mFilePath;

    public 
 String getmLocalurl(){
        return  mLocalurl;
    }

    public String getmHttpurl(){
        return mHttpurl;
    }

    public String getmFilePath(){
        return mFilePath;
    }

    public SwingDemo()
    {
        super("頭條爬蟲助手");
        JFrame frame =new JFrame("頭條爬蟲助手"); //設定標題
        frame.setSize(300,120); //設定視窗大小
        JPanel panel_up=new JPanel();   // JFrame 裡上下兩個部分
        JPanel panel_bottom=new JPanel();

        frame.setLayout(new BorderLayout());
        frame.add(panel_up,BorderLayout.CENTER);    // 輸入內容居中
        frame.add(panel_bottom,BorderLayout.SOUTH); // 按鍵在下方


        panel_up.setLayout(new GridLayout(2,2));    // 上方panel 2*2佈局

        JLabel HttpLabel =new JLabel("網址");
        JPanel panel_http_content =new JPanel();
        JLabel catalog =new JLabel("匯出目錄");
        JPanel panel_content =new JPanel();

        panel_up.add(HttpLabel);
        panel_up.add(panel_http_content);
        panel_up.add(catalog);
        panel_up.add(panel_content);


        JTextField LocalUrl=new JTextField(10);     // 本地儲存位置輸入

        panel_content.setLayout(new GridLayout());
        panel_content.add(LocalUrl);

        JTextField HttpUrl=new JTextField(10);      // 網頁http url

        panel_http_content.setLayout(new GridLayout());
        panel_http_content.add(HttpUrl);

        panel_bottom.setLayout(new FlowLayout());           // 按鈕佈局

        JButton btn_pic = new JButton("匯出圖片/文字");
        //JButton btn_text = new JButton("匯出文字");
        panel_bottom.add(btn_pic);
        //panel_bottom.add(btn_text);

        frame.setVisible(true);

        btn_pic.addActionListener(new ActionListener() {
            @Override
            public void actionPerformed(ActionEvent arg0) {
                mHttpurl = HttpUrl.getText().trim();    // 獲取輸入內容
                mLocalurl = LocalUrl.getText().trim();

                if(mLocalurl.length() == 0){
                    mLocalurl = "./";
                }

                CreateFile(mLocalurl);

                Crawler my = new Crawler(mHttpurl, mFilePath);

                Thread t = new Thread(my);
                t.start();

            }
        });
//        btn_text.addActionListener(new ActionListener() {
//            @Override
//            public void actionPerformed(ActionEvent arg1) {
//                System.out.println("Text");
//            }
//        });
    }

    // 建立資料夾
    void CreateFile(String FilePath){
        File file = null;
        File textfile = null;
        File picfile = null;
        FilePath = FilePath + "/頭條爬蟲助手";

        String tmpPath = FilePath;

        file = new File(tmpPath);
        int count = 1;
        while(file.exists()){       // 為避免檔名重複
            tmpPath = FilePath + "_" + Integer.toString(count);
            count++;
            file = new File(tmpPath);
        }
        FilePath = tmpPath;
        mFilePath = FilePath;

        String textdir = FilePath + "/文字庫";
        String picdir = FilePath + "/圖片庫";

        // 建立資料夾
        try {
            file = new File(FilePath);
            if (!file.exists()) {
                System.out.println("成功建立資料夾: "+FilePath);
                file.mkdirs();
            }
            else{
                System.out.println("資料夾已經存在: "+FilePath);
            }
            file = new File(picdir);
            if (!file.exists()) {
                System.out.println("成功建立資料夾: "+picdir);
                file.mkdirs();
            }
            else{
                System.out.println("資料夾已經存在: "+picdir);
            }
            file = new File(textdir);
            if (!file.exists()) {
                System.out.println("成功建立資料夾: "+textdir);
                file.mkdirs();
            }
            else{
                System.out.println("資料夾已經存在: "+textdir);
            }


        } catch (Exception e) {
            System.out.println("建立資料夾失敗: "+FilePath);
        } finally {
            file = null;
        }
    }

    public static void main(String[] args)
    {
        SwingDemo t = new SwingDemo();
    }


}

Crawler.java

import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.List;

/**
 * Downloads the images and the text of one article into an output folder.
 *
 * <p>The constructor fetches the page and extracts the image URLs and the
 * article text through {@link GetWebContent}; {@link #run()} then downloads the
 * images into the image sub-folder and appends the text to a file in the text
 * sub-folder.
 *
 * Created by geshuaiqi on 2017/11/9.
 */
public class Crawler implements Runnable {
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko";
    /** Connect/read timeout so that a stalled server cannot block the worker forever. */
    private static final int TIMEOUT_MILLIS = 15000;

    private String mHttpurl;
    private String mLocalurl;
    List<String> mPicUrlList;
    private String Article;

    /**
     * Fetches the page at {@code url} and extracts image URLs and article text.
     * This performs blocking network I/O in the calling thread.
     *
     * @param url         address of the article page
     * @param address_url output folder; it must already contain the image and
     *                    text sub-folders used by {@link #run()}
     */
    Crawler(String url,String address_url){
        mHttpurl = url;
        mLocalurl = address_url;

        GetWebContent Content = new GetWebContent(mHttpurl);
        mPicUrlList = Content.GetPicUrl(); // image URLs extracted from the page
        for (String picUrl : mPicUrlList) {
            System.out.println(picUrl);
        }
        Article = Content.getText();
    }


    /**
     * Downloads every extracted image as 1.jpg, 2.jpg, ... and then writes the
     * article text. A failed download is reported as a failure and is not
     * counted in the final total.
     */
    @Override
    public void run(){
        System.out.println("開始下載圖片");

        int total = mPicUrlList.size();
        int downloaded = 0;
        for (int i = 0; i < total; i++) {
            String fileName = (i+1)+".jpg";
            String pathname = mLocalurl + "/圖片庫/"+fileName;
            if (download(mPicUrlList.get(i), pathname)) {
                downloaded++;
                System.out.println("已完成:"+fileName+" ， 共"+(i+1)+"/"+total+"張");
            } else {
                System.out.println("下載失敗:"+fileName);
            }
        }
        System.out.println("全部下載完成，共" + downloaded + "張");

        String textpath = mLocalurl + "/文字庫/文字.txt";
        contentToTxt(textpath,Article); // append the article text to the text file
    }

    /**
     * Appends a newline followed by {@code content} to the file at
     * {@code filePath}, creating the file when needed. The text is written as
     * UTF-8 so the result does not depend on the platform default charset.
     * I/O errors are printed and otherwise ignored.
     *
     * @param filePath path of the text file
     * @param content  text to append
     */
    public static void contentToTxt(String filePath, String content) {
        try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(new File(filePath), true),
                java.nio.charset.StandardCharsets.UTF_8))) {
            writer.write("\n"+content);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads the image at {@code strUrl} into the file {@code pathname}.
     * Failures are reported on stdout and do not throw.
     *
     * @param strUrl   image address
     * @param pathname target file
     */
    public void getPic(String strUrl,String pathname) {
        download(strUrl, pathname);
    }

    /**
     * Streams the response body of {@code strUrl} straight into
     * {@code pathname}. Streams are always closed, and a partially written file
     * is removed when the download fails.
     *
     * @return true when the whole image was written
     */
    private boolean download(String strUrl, String pathname) {
        HttpURLConnection conn = null;
        try {
            conn = (HttpURLConnection) new URL(strUrl).openConnection();
            // The image host only answers requests that look like a browser.
            conn.setRequestProperty("User-Agent", USER_AGENT);
            conn.setConnectTimeout(TIMEOUT_MILLIS);
            conn.setReadTimeout(TIMEOUT_MILLIS);

            try (InputStream in = conn.getInputStream();
                 OutputStream out = new FileOutputStream(pathname)) {
                // Fixed-size copy buffer: memory use stays constant whatever the image size.
                byte[] buf = new byte[8192];
                int len;
                while ((len = in.read(buf)) != -1) {
                    out.write(buf, 0, len);
                }
            }
            return true;
        } catch (IOException | RuntimeException e) {
            System.out.println("Exception: " + strUrl + " -> " + e);
            new File(pathname).delete(); // do not leave a truncated image behind
            return false;
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }
    }


}

GetWebContent.java

/**
 * Created by geshuaiqi on 2017/11/10.
 */
import com.sun.org.apache.xerces.internal.xs.StringList;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Fetches an article page and extracts its body text and image URLs.
 *
 * <p>NOTE: the fetched page is kept in a static field, so {@link #getText()} and
 * {@link #GetPicUrl()} always operate on the page loaded by the most recently
 * constructed instance. Do not construct instances from several threads at once.
 */
public class GetWebContent {
    /** Matches the article payload: from "articleInfo" up to the last "commentInfo". */
    private static final String ARTICLE_REGEX = "articleInfo(.*)commentInfo";
    /** Matches one protocol-relative image address terminated by an escaped quote. */
    private static final Pattern PIC_URL_PATTERN = Pattern.compile("//(.*?)&quot");
    /**
     * Non-ideograph characters kept by {@link #getText()}: fullwidth comma (U+FF0C),
     * ideographic full stop (U+3002), tilde, fullwidth question mark (U+FF1F),
     * fullwidth exclamation mark (U+FF01) and line feed.
     */
    private static final String KEPT_PUNCTUATION = "\uFF0C\u3002~\uFF1F\uFF01\n";
    /** Ideographic full stop; a line break is inserted after each one. */
    private static final char FULL_STOP = '\u3002';

    private static String Html;

    /**
     * Downloads the page at {@code url} and makes it the current page.
     *
     * @param url address of the page
     */
    GetWebContent(String url){
        Html = getWebCon(url);
    }

    /**
     * Reads the whole page at {@code domain} as UTF-8 text. Line terminators are
     * dropped, so the result is one single line.
     *
     * @return the page text, or whatever was read before an error occurred
     *         (possibly the empty string); errors are reported on stderr
     */
    private static String getWebCon(String domain) {
        StringBuilder page = new StringBuilder();
        try (BufferedReader in = new BufferedReader(new InputStreamReader(
                new java.net.URL(domain).openStream(),
                java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while ((line = in.readLine()) != null) {
                page.append(line);
            }
        } catch (Exception e) {
            // Keep the error out of the page text so it cannot be mistaken for content.
            System.err.println("Failed to read " + domain + ": " + e);
        }
        return page.toString();
    }

    /**
     * Returns the first match of {@code patternStr} in {@code targetStr}.
     *
     * @return the matched text, or the placeholder "Nothing" when there is no match
     */
    private static String RegexString(String targetStr, String patternStr)
    {
        Matcher matcher = Pattern.compile(patternStr).matcher(targetStr);
        if (matcher.find())
        {
            return matcher.group(0);
        }
        return "Nothing";
    }

    /** Extracts the article payload from the current page; tolerates a page that was never loaded. */
    private static String extractArticle() {
        String page = (Html == null) ? "" : Html;
        return RegexString(page, ARTICLE_REGEX);
    }

    /**
     * Extracts the readable text of the article: only CJK ideographs
     * (U+4E00..U+9FA5) and a small set of punctuation marks are kept, and a line
     * break is added after every ideographic full stop.
     *
     * @return the filtered text; empty when the page contains no article payload
     */
    public static String getText(){
        String article = extractArticle();
        StringBuilder text = new StringBuilder();
        for (int i = 0; i < article.length(); i++) {
            char c = article.charAt(i);
            boolean isIdeograph = c >= '\u4E00' && c <= '\u9FA5';
            if (isIdeograph || KEPT_PUNCTUATION.indexOf(c) >= 0) {
                text.append(c);
                if (c == FULL_STOP) {
                    text.append('\n');
                }
            }
        }
        return text.toString();
    }

    /**
     * Collects the image addresses found in the article payload of the current
     * page. Each protocol-relative address is prefixed with "http:".
     *
     * @return the image URLs in page order; empty when none are found
     */
    public static List<String> GetPicUrl() {
        Matcher matcher = PIC_URL_PATTERN.matcher(extractArticle());

        List<String> urlList = new ArrayList<>();
        while (matcher.find()){
            // Strip the trailing escaped-quote marker that terminated the match.
            urlList.add("http:" + matcher.group(0).replace("&quot",""));
        }
        return urlList;
    }
}

Java爬蟲高階版(今日頭條)

宣告：浙大java課程小作業作者：GeSq 功能描述 UI介面結果邏輯程式碼功能描述爬取今日頭條文章的圖片和正文文字。僅適用與頭條文章版網頁，不支援相簿版網頁。 UI介面匯出目錄：自己填寫匯

python爬蟲爬取今日頭條APP資料（無需破解as ,cp，_cp_signature引數）

#!coding=utf-8 import requests import re import json import math import random import time from requests.packages.urllib3.exceptions import Insecure

PHP版今日頭條演算法面試題（持續更新）

1，現在有一個字串，你要對這個字串進行 n 次操作，每次操作給出兩個數字：(p, l) 表示當前字串中從下標為 p 的字元開始的長度為 l 的一個子串。你要將這個子串左右翻轉後插在這個子串原來位置的正後方，求最後得到的字串是什麼。字串的下標是從 0 開始的，你可以從樣例中得

Java爬蟲實踐：Jsoup+HttpUnit爬取今日頭條、網易、搜狐、鳳凰新聞

0x0 背景最近學習爬蟲，分析了幾種主流的爬蟲框架，決定使用最原始的兩大框架進行練手： Jsoup&HttpUnit 其中jsoup可以獲取靜態頁面，並解析頁面標籤，最主要的是，可以採用類似於jquery的語法獲取想要的標籤元素，例如： /

java爬蟲系列（五）——今日頭條文章爬蟲實戰

專案原始碼爬蟲目標爬取某一頭條號下面所有文章。爬蟲設計思路爬取方式動態解析網頁方式爬取之前介紹過使用webdriver的方式爬取網頁內容，這樣做的話好處非常明顯，只需要考慮如何解析網頁的element標籤就行

今日頭條爬蟲

comm bsp .html __main__ true lan 3.0 from iges 今日頭條是一個js動態加載的網站，嘗試了兩種方式爬取，一是頁面直接提取，一是通過接口提取： version1：直接頁面提取 #coding=utf-8 #今日頭條 from lx

爬蟲實戰【6】Ajax內容解析-今日頭條圖集

就是 get請求加載執行搜索 parse 編程滾動 from Ajax技術 AJAX = Asynchronous JavaScript and XML（異步的 JavaScript 和 XML）。 Ajax並不是新的編程語言，而是一種使用現有標準的新方法，當然也不

今日頭條移動app廣告激活數據API對接完整Java代碼實現供大家參考》》》項目隨記

blank title gpo 今日頭條引流推廣方式好習慣代碼實現這是自畢業後的第一篇博客，希望自己今後能養成寫博客的一個好習慣。最近公司為了加速APP推廣，采取在外部平臺（如：今日頭條）進行廣告投放的方式，進行用戶引流。因此我們需要對廣告的激活數據進行一個檢測

2018最新Java實戰開發今日頭條資訊網站

工具下載創建安全性 method 簡單 thread redis 用戶註冊 ==================課程目錄=====================第1節開發工具和Java語言介紹主要介紹項目所需要的開發工具，並且會簡單回顧這個項目所用到的語言-jav

python --爬蟲基礎 --爬取今日頭條使用 requests 庫的基本操作, Ajax

'''思路一: 由於是Ajax的網頁,需要先往下劃幾下看看XHR的內容變化二:分析js中的程式碼內容三:獲取一頁中的內容四:獲取圖片五:儲存在本地使用的庫1. requests 網頁獲取庫 2.from urllib.parse import urlencode 將字典轉化為字串內容整

(爬蟲)採用BeautifulSoup和正則爬取今日頭條圖集.詳細!

用beautifulsoup提取文字資訊,正則匹配關鍵的圖片資訊. 最後存入資料庫mongodb. 完成後的感想: 其實分析網頁是最關鍵的一個環節. ajax分析,json處理等等,還是需要多點練習. 下面是程式碼: ''' 步驟: 1. 首先抓取索引頁的內容,

今日頭條極速版，教你看新聞如何獲得零錢！邀請碼CYUUWLV8

如今有很多手機端的新聞APP的運營模式，都是採用吸引人流量進行上市！然而，使用者可以邊看新聞邊賺零花錢，例如：今日頭條極速版、天天快報、趣頭條等等。很多網友不知道怎麼用，例如：今日頭條極速版邀請碼是什麼？如何才能獲取收益？等問題！今日頭條極速版邀請碼：CYUUWLV8 要了解更多的可以收益

今日頭條極速版邀請碼以及其它APP邀請碼大全

現在大多手機新聞APP都需要輸入碼，在網上找了很久，最終找到一個比較全的文章，本人試過，都是可以使用的！第6個比較好，可邊看新聞，邊收益！嘻嘻！平時寫程式碼累了，休息刷一下！或者在睡覺前重新整理一下，每天積累，夠買早餐！ 1.天天快報（邀請碼） QAXTRN 2.趣頭條（邀請碼） A186964988