微醫網爬蟲(一) Java實現
阿新 • • 發佈:2019-01-08
爬取微醫網醫生的基本資料：獲取每個醫生的URL之後，可以使用以下方法解析：
想要採集醫生歷史問診詳細資訊的同學可以移步我們另一篇部落格:傳送門
/**
 * Downloads a doctor's profile page and copies the fields found on it into a new
 * {@link Doctor}.
 *
 * <p>Extraction is best-effort: every element is looked up individually and a field is simply
 * left unset when its element is missing. If the page cannot be fetched, the
 * {@link IOException} is printed and the returned object carries only the id.
 *
 * @param url profile page URL; the id is taken as everything after its first 30 characters
 * @return the populated (possibly partially populated) doctor, never {@code null}
 * @throws StringIndexOutOfBoundsException if {@code url} is shorter than 30 characters
 */
public Doctor getDoctorInfor(String url) {
    Doctor doctor = new Doctor();
    // Extract the id: everything after a fixed 30-character prefix.
    // NOTE(review): presumably the prefix is "https://www.guahao.com/expert/" - confirm.
    doctor.setId(url.substring(30));
    System.out.print("正在獲取:" + doctor.getId() + "\t");
    try {
        Document doc = Jsoup.connect(url).get();
        if (doc != null) {
            readProfile(doc, doctor);
            readStatistics(doc, doctor);
            readFollowerCount(doc, doctor);
            readPrices(doc, doctor);
            readCommentCount(doc, doctor);

            // Locate the link inside the history-ask section. The call that would consume it
            // is currently disabled, so this lookup has no effect on the result.
            Element historySection = doc.selectFirst(
                    "section[class=grid-section grid-section-outside expert-history-ask J_ExpertHistoryAsk]");
            if (historySection != null) {
                Element tipLink = historySection.selectFirst("a[class=tip]");
                if (tipLink != null) {
                    String aurl = tipLink.attr("href");
                    // TODO: restore the call below when the ask count and answer count are needed.
                    //getSomNum(aurl, doctor);
                    // getSomNumb(aurl,doctor);
                }
            }

            readServiceFlags(doc, doctor);
        }
    } catch (IOException e) {
        // Best-effort scrape: report the failure and return what we have (the id only).
        e.printStackTrace();
    }
    System.out.println("---->完成");
    return doctor;
}

/** Reads name, job title, expert flag, hospital, department, specialties and summary. */
private void readProfile(Document doc, Doctor doctor) {
    Element detail = doc.selectFirst("div[class=detail word-break]");
    if (detail == null) {
        return;
    }

    // Name and job title.
    Element heading = detail.selectFirst("h1");
    if (heading != null) {
        Element nameEl = heading.selectFirst("Strong");
        Element jobEl = heading.selectFirst("span");
        if (nameEl != null) {
            doctor.setName(nameEl.text());
            System.out.print(nameEl.text() + "\t");
        }
        if (jobEl != null) {
            doctor.setJob(jobEl.text());
        }
    }

    // Expert flag: "1" when the expert-group link is present, otherwise "0".
    doctor.setIsExpert(detail.selectFirst("a[class=expert-group]") != null ? "1" : "0");

    // Hospital (first link) and department (second link).
    Element hospitalDiv = detail.selectFirst("div[class=hospital]");
    if (hospitalDiv != null) {
        Elements links = hospitalDiv.select("a");
        // Check the size before indexing: get(1) on a one-link list would throw an unchecked
        // IndexOutOfBoundsException that the IOException handler does not catch.
        if (links.size() >= 1) {
            doctor.setHospital(links.get(0).text());
        }
        if (links.size() >= 2) {
            doctor.setRoom(links.get(1).text());
        }
    }

    // Specialties.
    Element goodAtDiv = detail.selectFirst("div[class=goodat]");
    if (goodAtDiv != null) {
        Element goodAtSpan = goodAtDiv.selectFirst("span");
        if (goodAtSpan != null) {
            doctor.setGoodAt(goodAtSpan.text());
        }
    }

    // Summary: prefer the link's data-description attribute, fall back to the span text.
    Element aboutDiv = detail.selectFirst("div[class=about]");
    if (aboutDiv != null) {
        Element aboutLink = aboutDiv.selectFirst("a");
        if (aboutLink != null) {
            doctor.setSummary(aboutLink.attr("data-description"));
        } else {
            Element aboutSpan = aboutDiv.selectFirst("span");
            if (aboutSpan != null) {
                doctor.setSummary(aboutSpan.text());
            }
        }
    }
}

/** Reads the three figures in the status/data box; ignored unless exactly three are present. */
private void readStatistics(Document doc, Doctor doctor) {
    Element statusDiv = doc.selectFirst("div[class=status]");
    if (statusDiv == null) {
        return;
    }
    Element dataDiv = statusDiv.selectFirst("div[class=data]");
    if (dataDiv == null) {
        return;
    }
    Elements figures = dataDiv.select("strong");
    if (figures.size() == 3) {
        // NOTE(review): the original comment listed these as rating, consultation count,
        // appointment count, but the code stores index 1 via setApoint and index 2 via setAsk.
        // The assignment order is kept as-is; confirm against the live page.
        doctor.setMarks(figures.get(0).text());
        doctor.setApoint(figures.get(1).text());
        doctor.setAsk(figures.get(2).text());
    }
}

/** Reads the follower count from the summary box. */
private void readFollowerCount(Document doc, Doctor doctor) {
    Element summaryDiv = doc.selectFirst("div[class=summary]");
    if (summaryDiv == null) {
        return;
    }
    Element markCount = summaryDiv.selectFirst("span[class=mark-count]");
    if (markCount != null) {
        doctor.setFocus(markCount.text());
    }
}

/** Reads up to two consultation prices, dropping the first character of each. */
private void readPrices(Document doc, Doctor doctor) {
    Element consultDiv = doc.selectFirst("div[class=consult-type]");
    if (consultDiv == null) {
        return;
    }
    Elements prices = consultDiv.select("p[class=current-price]");
    if (prices.size() >= 1) {
        String first = prices.get(0).text();
        // Skip empty text: substring(1) on "" throws StringIndexOutOfBoundsException.
        // NOTE(review): the dropped first character is presumably a currency symbol - confirm.
        if (!first.isEmpty()) {
            doctor.setPrice1(first.substring(1));
        }
    }
    if (prices.size() == 2) {
        String second = prices.get(1).text();
        if (!second.isEmpty()) {
            doctor.setPrice2(second.substring(1));
        }
    }
}

/** Reads the comment count from the expert-comment section. */
private void readCommentCount(Document doc, Doctor doctor) {
    Element commentSection =
            doc.selectFirst("section[class=grid-section grid-section-outside expert-comment]");
    if (commentSection == null) {
        return;
    }
    Element tip = commentSection.selectFirst("div[class=tip]");
    if (tip == null) {
        return;
    }
    Element count = tip.selectFirst("strong");
    if (count != null) {
        doctor.setComment(count.text());
    }
}

/** Reads the four service flags from the links inside the service box. */
private void readServiceFlags(Document doc, Doctor doctor) {
    Element serviceDiv = doc.selectFirst("div[id=service]");
    if (serviceDiv == null) {
        return;
    }
    // Every flag defaults to "0" once the service box exists; matching links override it.
    doctor.setIsGhuahao("0");
    doctor.setIsTuwen("0");
    doctor.setIsShihua("0");
    doctor.setIsFuwu("0");
    for (Element link : serviceDiv.select("a")) {
        String cssClass = link.attr("class");
        if (cssClass.contains("guahao")) {
            doctor.setIsGhuahao(isActive(cssClass));
        }
        if (cssClass.contains("tuwen")) {
            doctor.setIsTuwen(isActive(cssClass));
        }
        if (cssClass.contains("shipin")) {
            doctor.setIsShihua(isActive(cssClass));
        }
        if (cssClass.contains("servicePkg")) {
            // Fixed: this branch used to call setIsTuwen, which clobbered the tuwen flag and
            // left isFuwu permanently at "0".
            doctor.setIsFuwu(isActive(cssClass));
        }
    }
}
需要原始碼的同學可以聯絡博主QQ(1477517404)。爬取結果: