1. 程式人生 > >java編寫網頁爬蟲(分頁——插入資料庫——匯出資料)

java編寫網頁爬蟲(分頁——插入資料庫——匯出資料)

)(.*?)()(.*?)()(.*)");
          MatchertdMatcher = tdPattern.matcher(innerTR);
          //彈出層的
          PatterntdPatternPop = Pattern.compile("(.*)()(.*)");
             Matcher m1 =p1.matcher(content);
            if(m1.matches()){
               vo.setMobile(getStringNotNullValue(m1.group(5)));
             }
            //email
             Pattern p2 =Pattern.compile("(.*)(電郵)(.*?)()(.*?)(
)(.*)");
             Matcher m2 =p2.matcher(content);
            if(m2.matches()){
               vo.setEmail(getStringNotNullValue(m2.group(5)));
             }
          //公司URL
                Pattern p3 = Pattern.compile("(.*)(網址)(.*?)(<a href=\")(.*?)(\" target=\"_blank\")(.*)");
                Matcher m3 = p3.matcher(content);
                if(m3.matches()){
                    vo.setUrl(getStringNotNullValue(m3.group(5)));
                }
                //標籤
                Pattern p4 = Pattern.compile("(.*)(產品分類)(.*?)(-)(.*?)(</td>)(.*)");
             Matcher m4 =p4.matcher(content);
            if(m4.matches()){
               vo.setTags(getStringNotNullValue(m4.group(5)));
             }
            list.add(vo);
          }
          i++;
         System.out.println("解析第"+i+"條!!!");
       }
      System.out.println("------------------解析結束-------------------");
       returnlist;
    }

    privatestatic void insertData2DataBase(Listlist, Connection conn)throwsException{
      System.out.println("待插入的資料條數:" + list.size());
       String sql ="insert into test_enterprise_info(enterprise_name,url,email,mobile,tags) values (?,?,?,?,?)";
      PreparedStatement stmt = conn.prepareStatement(sql);
      for(EnterprisInfoVo vo:list){
         stmt.setString(1, vo.getName());
         stmt.setString(2, vo.getUrl());
         stmt.setString(3, vo.getEmail());
         stmt.setString(4, vo.getMobile());
         stmt.setString(5, vo.getTags());
         stmt.addBatch();
       }
      stmt.executeBatch();
      System.out.println("資料插入完畢!!!");
    }
   
    publicstatic Connection connectDataBase() throws SQLException{
       Connectionconn = null;
       String url = "jdbc:mysql://localhost:3306/smartpr?"
               +"user=root&password=root&useUnicode=true&characterEncoding=UTF8";
       try {
           Class.forName("com.mysql.jdbc.Driver");// 動態載入mysql驅動
           System.out.println("成功載入MySQL驅動程式");
           conn = DriverManager.getConnection(url);
       } catch (SQLException e) {
           System.out.println("MySQL操作錯誤");
           e.printStackTrace();
       } catch (Exception e) {
           e.printStackTrace();
       }
       return conn;
    }
   
    publicstatic String getStringNotNullValue(Object object) {
       if (object== null) {
          return"";
       } else{
          returnobject.toString().trim();
       }
    }
}
以上是把網頁上的資料插入到資料庫的過程。這裡需要注意一下 Java 的正則表示式,尤其是 find() 方法和 matches() 方法的區別:find() 是部分匹配,只要輸入中有一段子字串符合正則即可,而且每次呼叫都會自動移動到下一個符合條件的目標;matches() 則要求整個輸入字串都符合正則。因此我的程式碼中配合 matches() 使用的正則,開頭和結尾都加上了 (.*),而配合 find() 使用的正則則不需要。


下面把資料庫中的資料匯出到桌面上的 Excel 檔案:
package com;

import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFCellStyle;
import org.apache.poi.hssf.usermodel.HSSFFont;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hssf.util.Region;

import com.mysql.jdbc.PreparedStatement;

public class ExcelTest {
   
   
    publicstatic void main(String[] args) throws Exception{
      //1、連線資料庫
       Connectionconn = connectDataBase();
      //2、從資料庫中取資料
       Listlist =getDataFromDataBase(conn);
      //3、把資料插入excel表中
      insertData2Excel(list);
    }
   
    privatestatic ListgetDataFromDataBase(Connection conn) throwsException{
       String sql ="select * from test_enterprise_info";
      PreparedStatement stmt = (PreparedStatement)conn.prepareStatement(sql);
       ResultSet rs= stmt.executeQuery();
       Listlist =new ArrayList();
      while(rs.next()){
         EnterprisInfoVo vo = new EnterprisInfoVo();
          String name= rs.getString(2);
          String url =rs.getString(3);
          String email= rs.getString(4);
          Stringmobile = rs.getString(5);
          String tags= rs.getString(6);
         vo.setEmail(email);
         vo.setMobile(mobile);
         vo.setName(name);
         vo.setTags(getStringNotNullValue(tags).replace("
", ""));
         vo.setUrl(url);
         list.add(vo);
       }
       returnlist;
    }
   
   @SuppressWarnings("deprecation")
    publicstatic void insertData2Excel(Listlist){
       //宣告一個工作薄
       HSSFWorkbook wb = new HSSFWorkbook();
       //宣告一個單子並命名
       HSSFSheet sheet = wb.createSheet("企業資訊表");
       //給單子名稱一個長度
       sheet.setDefaultColumnWidth((short)15);
       // 生成一個樣式 
       HSSFCellStyle style = wb.createCellStyle();
       //建立第一行(也可以稱為表頭)
       sheet.addMergedRegion(new Region(0, (short)0, 0, (short)4));
       HSSFRow row0 = sheet.createRow(0);
       HSSFCell cell0 = row0.createCell(0);
       row0.setHeight((short)500);
       HSSFCellStyle style0 = wb.createCellStyle();
       row0.setRowStyle(style0);
       //設定字型
       HSSFFont font = wb.createFont();
       style0.setFont(font);
       font.setFontHeightInPoints((short)14);
       font.setColor(HSSFFont.BOLDWEIGHT_BOLD);
       //單元格內容
       cell0.setCellValue("企業資訊表");
       HSSFRow row = sheet.createRow(1);
       //樣式字型居中
       style.setAlignment(HSSFCellStyle.ALIGN_CENTER);
       //給表頭第一行一次建立單元格
       HSSFCell cell = row.createCell((short) 0);
       cell.setCellValue("企業名稱");
       cell.setCellStyle(style);
       cell = row.createCell( (short) 1); 
       cell.setCellValue("企業網站URL"); 
       cell.setCellStyle(style); 
       cell = row.createCell((short) 2); 
       cell.setCellValue("企業email"); 
       cell.setCellStyle(style);
       cell = row.createCell((short) 3); 
       cell.setCellValue("企業聯絡電話"); 
       cell.setCellStyle(style);
       cell = row.createCell((short) 4); 
       cell.setCellValue("企業標籤"); 
       cell.setCellStyle(style);
      //向單元格里填充資料
      for (int i = 0; i < list.size(); i++) {
           row = sheet.createRow(i + 2);
           EnterprisInfoVo vo = list.get(i);
           row.createCell(0).setCellValue(vo.getName());
           row.createCell(1).setCellValue(vo.getUrl());
           row.createCell(2).setCellValue(vo.getEmail());
           row.createCell(3).setCellValue(vo.getMobile());
           row.createCell(4).setCellValue(vo.getTags());
       }
        
      try {
           //預設匯出到E盤下
           FileOutputStream out = newFileOutputStream("C://Users//Administrator//Desktop/EnterpriseInfo.xls");
           wb.write(out);
           out.close();
       } catch (FileNotFoundException e) {
           e.printStackTrace();
       } catch (IOException e) {
           e.printStackTrace();
       }
    }
   
   
    publicstatic Connection connectDataBase() throws SQLException{
       Connectionconn = null;
       String url = "jdbc:mysql://localhost:3306/smartpr?"
               +"user=root&password=root&useUnicode=true&characterEncoding=UTF8";
       try {
           Class.forName("com.mysql.jdbc.Driver");// 動態載入mysql驅動
           System.out.println("成功載入MySQL驅動程式");
           conn = DriverManager.getConnection(url);
       } catch (SQLException e) {
           System.out.println("MySQL操作錯誤");
           e.printStackTrace();
       } catch (Exception e) {
           e.printStackTrace();
       }
       return conn;
    }
   
    publicstatic String getStringNotNullValue(Object object) {
       if (object== null) {
          return"";
       } else{
          returnobject.toString().trim();
       }
    }
}