1. 程式人生 > 其它 >爬取拉勾網職位等資訊(Java)

爬取拉勾網職位等資訊(Java)

工具:火狐瀏覽器(Firefox)、Selenium IDE(3 以上版本)、Eclipse、selenium-java.jar(需匯入)、selenium-server-standalone-3.141.5.jar(需匯入)、poi-bin-5.2.2(需匯入),以及另外下載的 geckodriver.exe(Selenium 3 啟動 Firefox 時需要)。

  1 package one;
  2 
  3 import java.io.File;
  4 import java.io.FileOutputStream;
  5 import java.util.List;
  6 
  7 import org.apache.poi.hssf.usermodel.HSSFRow;
  8 import org.apache.poi.hssf.usermodel.HSSFSheet;
9 import org.apache.poi.hssf.usermodel.HSSFWorkbook; 10 import org.apache.poi.ss.usermodel.Workbook; 11 import org.openqa.selenium.By; 12 import org.openqa.selenium.WebDriver; 13 import org.openqa.selenium.WebDriver.Navigation; 14 import org.openqa.selenium.WebElement; 15 import org.openqa.selenium.firefox.FirefoxDriver;
16 17 public class search_lagouwang { 18 19 public static void main(String[] args) throws InterruptedException { 20 21 String s = "",job="",company="",money="",experience="",whole=""; 22 int i=1; 23 int j=1; //i表示表格第幾行,j表示網頁第幾頁 24 25 //建立工作表,需要匯出到檔案
26 HSSFWorkbook workbook = new HSSFWorkbook(); 27 HSSFSheet sheet = workbook.createSheet(); 28 HSSFRow row = sheet.createRow(0); 29 row.createCell(0).setCellValue("職位"); 30 row.createCell(1).setCellValue("公司"); 31 row.createCell(2).setCellValue("薪資"); 32 row.createCell(3).setCellValue("工作經驗"); 33 34 System.setProperty ( "webdriver.firefox.bin" , "E:\\Mozilla Firefox\\firefox.exe" ); //需匯入一堆.jar 35 System.setProperty("webdriver.gecko.driver", "E:\\Mozilla Firefox\\geckodriver.exe"); 36 //selenium3中沒有火狐啟動驅動,需要重新下載geckodriver.exe 37 WebDriver driver = new FirefoxDriver(); 38 Navigation navigation = driver.navigate(); 39 navigation.to("https://www.lagou.com/"); 40 Thread.sleep(3000); 41 42 driver.findElement(By.id("cboxClose")).click(); //點選彈出上面的X按鈕 43 Thread.sleep(3000); //需要等待幾秒,不然時間太快輸入框獲取不到 44 driver.findElement(By.id("search_input")).clear(); 45 driver.findElement(By.id("search_input")).sendKeys("java"); 46 driver.findElement(By.id("search_button")).click(); 47 Thread.sleep(2000); 48 //獲取下一頁按鈕 49 WebElement next = driver.findElement(By.className("lg-pagination-next")); 50 while(next != null && next.isEnabled()==true) { //按鈕存在且可點選 51 //搜尋過程 52 53 List<WebElement> all = driver.findElements(By.className("item-top__1Z3Zo")); //是findElements,多個元素集合 54 55 for(WebElement a : all) { 56 //獲得職位 57 job = a.findElement(By.className("p-top__1F7CL")) 58 .findElement(By.tagName("a")).getText(); 59 //獲得公司 60 company = a.findElement(By.className("company-name__2-SjF")) 61 .findElement(By.tagName("a")).getText(); 62 //獲得薪資、經驗 63 money = a.findElement(By.className("p-bom__JlNur")) 64 .findElement(By.tagName("span")).getText(); 65 whole = a.findElement(By.className("p-bom__JlNur")).getText(); 66 experience = whole.substring(money.length(), whole.length()); 67 //輸出到工作表 68 row = sheet.createRow(i); 69 row.createCell(0).setCellValue(job); 70 row.createCell(1).setCellValue(company); 71 row.createCell(2).setCellValue(money); 72 row.createCell(3).setCellValue(experience); 73 
//System.out.println("公司: "+company+"職位:"+job+"薪資: "+money+"經驗: "+experience); 74 i++; 75 76 } 77 //點選下一頁 78 next.click(); 79 Thread.sleep(3000); 80 j++; //由於搜尋過多頁會彈出登入框,只搜素10頁,j控制頁數 81 if(j<=10) { 82 next = driver.findElement(By.className("lg-pagination-next")); 83 }else { 84 break; 85 } 86 } 87 88 89 try { 90 //建立輸出流,將工作表寫入到檔案lagouwang.xls 91 File file = new File("E:/lagouwang_1.xls"); 92 file.createNewFile(); 93 FileOutputStream ot = new FileOutputStream(file); 94 workbook.write(ot); 95 }catch(Exception e) { 96 e.printStackTrace(); 97 } 98 99 } 100 101 }