爬取攜程頁面酒店資訊並且匯入到HDFS

阿新 • • 發佈：2019-01-19

這裡寫圖片描述

package com.itstar.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import 
 java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.itstar.hadoop.HdfsUtil;

/**
 * Utility class for batch collection and download of data files: fetches a
 * web page, extracts hotel entries from its HTML with Jsoup and downloads the
 * hotel images to a local directory.
 *
 * @author arry
 * @version v1.0
 */
public class DataDownUtil {

    /** Directory hotel images are saved to when the caller does not supply one. */
    private static final String DEFAULT_IMAGE_DIR =
            "D:\\Windows 7 Documents\\Desktop\\Python\\project\\pic\\";

    /** Upper bound for establishing a connection, so a dead host cannot hang the crawl. */
    private static final int CONNECT_TIMEOUT_MILLIS = 10000;

    /** Upper bound for a single blocking read, so a stalled server cannot hang the crawl. */
    private static final int READ_TIMEOUT_MILLIS = 30000;

    /** Size of the buffer used when copying a downloaded image to disk. */
    private static final int COPY_BUFFER_SIZE = 8192;

    /**
     * Fetches the source of the page at the given address, decoded with the
     * given character set.
     *
     * <p>Failures are reported on the console and are not propagated: the
     * method returns whatever was read before the failure, which is an empty
     * string when nothing could be read.
     *
     * @author arry
     * @param url address of the page
     * @param encoding name of the character set the page is encoded in
     * @return the page source, every line terminated by {@code "\n"}
     */
    public static String getHtmlResourceByURL(String url,String encoding){

        // Accumulates the page source; StringBuilder because it never leaves this thread.
        StringBuilder buffer = new StringBuilder();

        try {
            URLConnection uc = new URL(url).openConnection();
            uc.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
            uc.setReadTimeout(READ_TIMEOUT_MILLIS);

            // The raw stream is its own resource so that it is still closed
            // when the reader cannot be created (e.g. unsupported encoding).
            try (InputStream in = uc.getInputStream();
                 BufferedReader reader = new BufferedReader(new InputStreamReader(in, encoding))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    buffer.append(line).append('\n');
                }
            }

        } catch (MalformedURLException e) {
            e.printStackTrace();
            System.out.println("網路不給力，請檢查設定。");
        } catch (IOException e) {
            e.printStackTrace();
            System.out.println("您的網路連結失敗，親稍後重試 ！");
        }

        return buffer.toString();

    }

    /**
     * Parses the page source, collects the hotel entries and downloads each
     * hotel image into the default image directory.
     *
     * @author arry
     * @param html source of the hotel list page
     * @return one map per hotel with the keys {@code imgSrc}, {@code title}
     *         and {@code desc}
     */
    public static List<HashMap<String,Object>> getImgUrl(String html){
        return getImgUrl(html, DEFAULT_IMAGE_DIR);
    }

    /**
     * Parses the page source, collects the hotel entries and downloads each
     * hotel image into {@code imageDir}.
     *
     * <p>Entries without an image are still returned; only their download is
     * skipped.
     *
     * @param html source of the hotel list page
     * @param imageDir local directory the images are saved to
     * @return one map per hotel with the keys {@code imgSrc}, {@code title}
     *         and {@code desc}
     */
    public static List<HashMap<String,Object>> getImgUrl(String html, String imageDir){
        List<HashMap<String,Object>> list = new ArrayList<HashMap<String,Object>>();
        Document document = Jsoup.parse(html);
        // Every hotel on the page is rendered inside an element of this class.
        Elements elements = document.getElementsByClass("hotel_new_list");

        for(Element ele : elements){
            HashMap<String,Object> map = new HashMap<String,Object>();
            Elements images = ele.getElementsByTag("img");
            // Hotel image address
            String imgSrc = images.attr("src");
            // Hotel name (taken from the image's alt text)
            String title = images.attr("alt");
            // Hotel description
            String desc = ele.getElementsByClass("hotel_item_htladdress").text();

            System.out.println("圖片："+imgSrc);
            System.out.println("酒店名稱："+title);
            System.out.println("酒店描述資訊："+desc);

            map.put("imgSrc", imgSrc);
            map.put("title", title);
            map.put("desc",desc);

            list.add(map);

            // Download the image; an entry without a usable address is skipped
            // instead of being turned into the bogus URL "http:".
            String imgUrl = toAbsoluteImageUrl(imgSrc);
            if(imgUrl != null){
                getImg(imgUrl, imageDir);
            }
        }

        return list;

    }

    /**
     * Turns the {@code src} value of an image into a downloadable URL.
     *
     * @param src raw attribute value, may be empty
     * @return {@code src} itself when it already names http/https, otherwise
     *         {@code "http:" + src} (the page uses scheme-relative addresses);
     *         {@code null} when there is no address at all
     */
    private static String toAbsoluteImageUrl(String src){
        if(src == null){
            return null;
        }
        String trimmed = src.trim();
        if(trimmed.isEmpty()){
            return null;
        }
        if(trimmed.startsWith("http://") || trimmed.startsWith("https://")){
            return trimmed;
        }
        return "http:" + trimmed;
    }

    /**
     * Derives the local file name from an image URL: the last path segment,
     * without query string or fragment.
     *
     * @param imgUrl address of the image
     * @return the file name, or {@code null} when the URL does not end in a
     *         usable name
     */
    private static String fileNameOf(String imgUrl){
        String path = imgUrl;
        int cut = path.indexOf('?');
        if(cut >= 0){
            path = path.substring(0, cut);
        }
        cut = path.indexOf('#');
        if(cut >= 0){
            path = path.substring(0, cut);
        }
        // lastIndexOf yields -1 when there is no '/', which makes this the whole string.
        String name = path.substring(path.lastIndexOf('/') + 1);
        if(name.isEmpty() || name.equals(".") || name.equals("..")){
            return null;
        }
        return name;
    }

    /**
     * Downloads an image and stores it under its own file name in the given
     * directory, creating the directory when necessary.
     *
     * <p>The download is best effort: any failure is printed and the method
     * returns normally, so one broken image does not abort a whole crawl.
     *
     * @author arry
     * @param imgUrl address of the image
     * @param filePath directory the image is saved to
     */
    public static void getImg(String imgUrl,String filePath){

        if(imgUrl == null || filePath == null){
            System.err.println("getImg: image URL and target directory are required");
            return;
        }

        String fileName = fileNameOf(imgUrl);
        if(fileName == null){
            System.err.println("getImg: no file name in image URL: " + imgUrl);
            return;
        }

        try{
            // Make sure the target directory exists.
            File dir = new File(filePath);
            if(!dir.isDirectory() && !dir.mkdirs()){
                throw new IOException("Cannot create directory " + dir);
            }

            URLConnection connection = new URL(imgUrl).openConnection();
            connection.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
            connection.setReadTimeout(READ_TIMEOUT_MILLIS);

            // The network stream is opened first, so a failed request does not
            // leave an empty file behind; both streams are closed on every path.
            try (InputStream is = connection.getInputStream();
                 FileOutputStream fos = new FileOutputStream(new File(dir, fileName))) {
                byte[] chunk = new byte[COPY_BUFFER_SIZE];
                int count;
                while((count = is.read(chunk)) != -1){
                    fos.write(chunk, 0, count);
                }
            }

        } catch(Exception e){
            // Deliberately broad: the download is best effort (see method comment).
            e.printStackTrace();
        }
    }



    // Program entry point
    public static void main(String[] args){

        System.out.println("親愛的同學們，大家晚上好，我愛你們  ！");

        String url = "http://hotels.ctrip.com/domestic/showhotellist.aspx?utm_medium=&utm_campaign=&utm_source=&isctrip=&allianceid=13963&sid=457771&ouid=000401app-&txtcity=%c9%cf%ba%a3&city=2&starttime=2017-12-06&deptime=2017-12-08&begprice=&endprice=&rooms=&hotelname=&star=&keyword=&locationid=&zoneid=";
        String encoding = "utf-8";
        // 1. Fetch the page source for the given address and character set
        String html = getHtmlResourceByURL(url, encoding);
        //System.out.println(html);

        // 3. Download the images and collect the hotel information
        List<HashMap<String,Object>> list = getImgUrl(html);

        System.out.println(list);


        // 4. TODO: store the result in HDFS (not implemented in this method)

    }


}

獲取的詳細資訊
這裡寫圖片描述

匯入到HDFS中

/**
 * Helper that uploads locally downloaded files to HDFS through a shared
 * {@code FileSystem} handle created once when the class is loaded.
 */
public class HdfsUtil{
    /** Address of the HDFS name node; used both as default file system and as connection URI. */
    private static final String HDFS_URI = "hdfs://localhost";

    private static FileSystem fs = null;

    /** Reason the file system could not be opened, kept so it can be reported on use. */
    private static Throwable initFailure = null;

    static{
        try{
            // Client configuration
            Configuration conf = new Configuration();
            // The default file system needs the scheme; a bare host name is not a valid value.
            conf.set("fs.defaultFS",HDFS_URI);
            // Open the file system as user "hdfs"
            fs=FileSystem.get(new URI(HDFS_URI),conf,"hdfs");
        }catch(Exception e){
            // Remember the cause instead of losing it; fileUpload reports it.
            initFailure = e;
            e.printStackTrace();
        }

    }

    /**
     * Uploads the local image directory to {@code /arry2018} in HDFS.
     *
     * @author arry
     * @throws IOException if the file system could not be opened when the
     *         class was loaded, or if the copy fails
     * @throws IllegalArgumentException declared by the original signature;
     *         not thrown directly by this method
     */
    @Test
    public void fileUpload() throws IllegalArgumentException, IOException{

        // Fail with the real cause rather than a NullPointerException.
        if(fs == null){
            throw new IOException("HDFS file system at " + HDFS_URI + " is not available", initFailure);
        }

        // NOTE(review): DataDownUtil saves images under "...\\project\\pic\\" while this
        // uploads "...\\project\\images\\" - confirm which directory is intended.
        fs.copyFromLocalFile(new Path("D:\\Windows 7 Documents\\Desktop\\Python\\project\\images\\"), new Path("/arry2018"));

    }
}

爬取攜程頁面酒店資訊並且匯入到HDFS

package com.itstar.util; import java.io.BufferedReader; import java.io.File; import java.io.FileOutputStream; import java.io

Python爬取攜程旅遊行程資訊+GIS視覺化

一、需求：爬取攜程旅行網的“北京推薦行程”首頁的各個行程文章，將各個行程所包含的景點資訊提取出來，並匯入ArcGIS進行GIS視覺化。二、爬取思路：爬取北京推薦行程主頁的各個文章的URL，然後通過該URL爬取出行程文章的資料

爬蟲 — 爬取攜程的航班資訊

功能介紹：輸入起點、終點、時間就能得到攜程上的航班資訊程式碼： from prettytable import PrettyTable import requests import json def xiecheng(dcity,acity,date):

利用selenium爬取攜程酒店資訊

上節部落格我們利用requests請求庫，正則表示式來提取資訊（連結https://mp.csdn.net/postedit/81865681），提到過使用selenium也可以抓取酒店資訊，在這裡利用selenium模組優點是不需要資料處理過濾，只需要處理異常，（實際上也是一樣的效果）但是對於

python爬取攜程酒店資料

首先開啟攜程所有北京的酒店http://hotels.ctrip.com/hotel/beijing1 簡簡單單，原始碼中包含我們需要的酒店資料，你以為這樣就結束了？攜程的這些資料這麼廉價地就給我們得到了？事實並不是如此，當我們點選第二頁的時候出現問題：雖然酒店的資料改變了，但是我們發現

Java資料爬取——爬取攜程酒店資料（二）

1.首先思考怎樣根據地域獲取地域酒店資訊，那麼我們看一下攜程上是怎樣獲得的。還是開啟http://hotels.ctrip.com/domestic-city-hotel.html 這個地址，隨便點選一個地區進去（這裡我選取澳門作為示例），點選第二頁資料

使用requests、re、BeautifulSoup、線程池爬取攜程酒店信息並保存到Excel中

備案 info imp lis sub host write count star import requests import json import re import csv import threadpool import time, random

Scrapy爬取攜程桂林問答

guilin.sql： CREATE TABLE `guilin_ask` ( `id` INT(11) NOT NULL AUTO_INCREMENT COMMENT '主鍵', `question` VARCHAR(255) DEFAULT NULL COM

Python爬蟲練手小專案：爬取窮遊網酒店資訊

Python爬蟲練手小專案：爬取窮遊網酒店資訊 Python學習資料或者需要程式碼、視訊加Python學習群：960410445 前言對於初學者而言，案例主要的是為了讓大家練手，明白其中如何這樣寫的思路，而不是拿著程式碼執行就完事了。基本環境配置系統

python中scrapy框架爬取攜程景點資料

--------------------------------------------------------------------------------------------- [版權申明：本文系作者原創，轉載請註明出處] 文章出處：https://blog.cs

爬取攜程和螞蜂窩的景點評論資料\攜程評論資料爬取\旅遊網站資料爬取

本人長期出售超大量微博資料、旅遊網站評論資料，並提供各種指定資料爬取服務，Message to [email protected]。同時歡迎加入社交媒體資料交流群：99918768 前言為了獲取多源資料需要到各個網站獲取一些景點的評論資訊

Python爬取github登入頁面token資訊並登入github

1.語言： Python3.5 2.用到的庫： requests re(正則) 3.流程：手動登入檢視需要的引數，觀察一次登入傳送的請求登入過程如下： ***** *** *

Python爬蟲抓取攜程網機票資訊並發郵件通知

背景：由於要買機票，所以一直進行搜尋，爬蟲可以幫我解決這個問題；解釋的超級詳細。於是通過這一過程，基本瞭解了一些；查詢上海到西安 4.29～05.02的機票： #coding:utf-8 import urllib2 from

用正則表示式從攜程頁面原始碼提取酒店資訊並插入mysql資料庫

file_regEx_mysql_io desc: 從txt檔案中讀取字串，通過正則表示式提取關鍵字，並插入mysql資料庫的一個小demo. 前言因為最近公司需要用到攜程上面的一些酒店資料，由於沒有接觸過爬蟲，所以只能投機取巧去爬html原始檔的程式碼，然後

【圖文詳解】scrapy爬蟲與動態頁面——爬取拉勾網職位資訊（1）

5-14更新注意：目前拉勾網換了json結構，之前是content - result 現在改成了content- positionResult - result,所以大家寫程式碼的時候要特別注意加上

【圖文詳解】scrapy爬蟲與動態頁面——爬取拉勾網職位資訊（2）

上次挖了一個坑，今天終於填上了，還記得之前我們做的拉勾爬蟲嗎？那時我們實現了一頁的爬取，今天讓我們再接再厲，實現多頁爬取，順便實現職位和公司的關鍵詞搜尋功能。之前的內容就不再介紹了，不熟悉的請一定要去看之前的文章，程式碼是在之前的基礎上修改的

python3實現爬取淘寶頁面的商品的資料資訊（selenium+pyquery+mongodb）

1.環境須知做這個爬取的時候需要安裝好python3.6和selenium、pyquery等等一些比較常用的爬取和解析庫，還需要安裝MongoDB這個分散式資料庫。 2.直接上程式碼 spider.py import re from config

爬取貼吧頁面

turn tex max request 完成發送 span fragment 代碼 Get方式 GET請求一般用於我們向服務器獲取數據，比如說，我們用百度搜索傳智播客：https://www.baidu.com/s?wd=傳智播客瀏覽器的url會跳轉成如圖所示

爬蟲框架Scrapy入門——爬取acg12某頁面

ima 需要 random 代碼定義 ons tps 框架 resp 1.安裝1.1自行安裝python3環境1.2ide使用pycharm1.3安裝scrapy框架2.入門案例2.1新建項目工程2.2配置settings文件2.3新建爬蟲app新建app將start_u

利用高德API + Python爬取鏈家網租房資訊 01

看了實驗樓的專案發現五八同城爬取還是有點難度所以轉戰鏈家實驗程式碼如下 from bs4 import BeautifulSoup from urllib.request import urlopen import csv url = 'https://gz.lia

爬取攜程頁面酒店資訊並且匯入到HDFS

相關推薦