Linux獲取網頁原始碼的幾種方法 linux爬蟲程式
阿新 • • 發佈:2018-11-15
分享一下我老師大神的人工智慧教程!零基礎,通俗易懂!http://blog.csdn.net/jiangjunshow
也歡迎大家轉載本篇文章。分享知識,造福人民,實現我們中華民族偉大復興!
第一種方法是利用Linux下的工具來獲取網頁原始碼。我用的是Wget,也可以使用Curl;Curl更加靈活,可以設定很多引數。
C++程式碼
//Wrap text in single quotes so the shell passes it through as one literal word.
//An embedded ' is emitted as '\'' (close quote, escaped quote, reopen quote).
static string ShellQuote(const string &text)
{
    string quoted = "'";
    for(string::size_type i = 0; i < text.length(); ++i)
    {
        if(text[i] == '\'')
        {
            quoted.append("'\\''");
        }
        else
        {
            quoted.push_back(text[i]);
        }
    }
    quoted.push_back('\'');
    return quoted;
}

//Fetch a web page by shelling out to wget.
//  url: address of the page to download
//  returns: the lines of the downloaded file concatenated together (line
//           terminators are dropped), or "" when the URL has no file-name
//           component after its last '/', when wget reports failure, or when
//           the downloaded file cannot be opened
//The page is staged in a file in the current directory, named after the last
//path component of the URL, and that file is removed before returning.
string GetHtmlByWget(string url)
{
    //Name of the staging file: everything after the last '/'.
    //If there is no '/', npos + 1 wraps to 0 and the whole URL is used.
    string fileName = url.substr(url.find_last_of("/") + 1);
    if(fileName == "")
    {
        return "";
    }

    //The "./" prefix keeps a name such as "-" from being read as an option
    string localPath = "./";
    localPath.append(fileName);
    string quotedPath = ShellQuote(localPath);

    //-q suppresses progress output; -O pins the output file so the name read
    //back below is always the one wget wrote; "--" ends option parsing.
    //Both arguments are quoted because the URL is caller-supplied text.
    string strCom = "wget -q -O ";
    strCom.append(quotedPath);
    strCom.append(" -- ");
    strCom.append(ShellQuote(url));
    int wgetStatus = system(strCom.c_str());

    //Read the file line by line; std::getline has no line-length limit
    string strHtml = "";
    ifstream fin(localPath.c_str());
    if(fin)
    {
        string line;
        while(getline(fin, line))
        {
            strHtml.append(line);
        }
        fin.close();
    }

    //Remove the staging file (-f: no prompt, no error if it is missing)
    strCom = "rm -f -- ";
    strCom.append(quotedPath);
    system(strCom.c_str());

    //A failed download must not be reported as page content
    if(wgetStatus != 0)
    {
        return "";
    }
    return strHtml;
}
第二種方法是用socket來獲取原始碼
C++程式碼
- //通過GET獲取網頁原始碼
- string GetHtmlByGet(string url)
- {
- string strHtmlContent = "";
- int sockfd;
- struct sockaddr_in addr;
- struct hostent *pURL;
- char text[RECVBUF];
- //分析連結
- UrlInfo urlInfo = ParseURL(url);
- string sAccept = "Accept: */*\r\nAccept-Language: zh-cn\r\nAccept-Encoding: gzip, deflate";
- //不同的主機UserAgent不同
- string sUserAgent = "Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/534.10";
- //將埠轉換為字串
- char t[6];
- string strPort;
- sprintf(t,"%d", urlInfo.Port);
- strPort = t;
- //構造傳送字串
- string strRequest = "";
- strRequest.append("GET ");
- strRequest.append(urlInfo.File);
- strRequest.append("?");
- strRequest.append(urlInfo.Body);
- strRequest.append(" HTTP/1.1\r\n");
- strRequest.append(sAccept);
- strRequest.append("\r\nUser-Agent:");
- strRequest.append(sUserAgent);
- strRequest.append("\r\nHost:");
- strRequest.append(urlInfo.Host);
- strRequest.append(":");
- strRequest.append(strPort);
- strRequest.append("\r\nConnection: Keep-Alive\r\n\r\n");
- char* host = const_cast<char*>(urlInfo.Host.c_str());
- sockfd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); //TCP方式傳送
- pURL = gethostbyname(host);
- addr.sin_family = AF_INET;
- addr.sin_addr.s_addr = *((unsigned long*)pURL->h_addr);
- addr.sin_port = htons(80);
- //連線
- connect(sockfd,(struct sockaddr *)&addr,sizeof(addr));
- //傳送
- send(sockfd, const_cast<char*>(strRequest.c_str()), strRequest.length(), 0);
- //接受
- while(recv(sockfd, text, RECVBUF, 0) > 0)
- {
- strHtmlContent.append(text);
- bzero(text,RECVBUF);
- }
- //關閉socket
- close(sockfd);
- //返回接受結果
- return strHtmlContent;
- }
- //通過GET獲取網頁原始碼
- string GetHtmlByGet(string url)
- {
- string strHtmlContent = "";
- int sockfd;
- struct sockaddr_in addr;
- struct hostent *pURL;
- char text[RECVBUF];
- //分析連結
- UrlInfo urlInfo = ParseURL(url);
- string sAccept = "Accept: */*\r\nAccept-Language: zh-cn\r\nAccept-Encoding: gzip, deflate";
- //不同的主機UserAgent不同
- string sUserAgent = "Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/534.10";
- //將埠轉換為字串
- char t[6];
- string strPort;
- sprintf(t,"%d", urlInfo.Port);
- strPort = t;
- //構造傳送字串
- string strRequest = "";
- strRequest.append("GET ");
- strRequest.append(urlInfo.File);
- strRequest.append("?");
- strRequest.append(urlInfo.Body);
- strRequest.append(" HTTP/1.1\r\n");
- strRequest.append(sAccept);
- strRequest.append("\r\nUser-Agent:");
- strRequest.append(sUserAgent);
- strRequest.append("\r\nHost:");
- strRequest.append(urlInfo.Host);
- strRequest.append(":");
- strRequest.append(strPort);
- strRequest.append("\r\nConnection: Keep-Alive\r\n\r\n");
- char* host = const_cast<char*>(urlInfo.Host.c_str());
- sockfd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); //TCP方式傳送
- pURL = gethostbyname(host);
- addr.sin_family = AF_INET;
- addr.sin_addr.s_addr = *((unsigned long*)pURL->h_addr);
- addr.sin_port = htons(80);
- //連線
- connect(sockfd,(struct sockaddr *)&addr,sizeof(addr));
- //傳送
- send(sockfd, const_cast<char*>(strRequest.c_str()), strRequest.length(), 0);
- //接受
- while(recv(sockfd, text, RECVBUF, 0) > 0)
- {
- strHtmlContent.append(text);
- bzero(text,RECVBUF);
- }
- //關閉socket
- close(sockfd);
- //返回接受結果
- return strHtmlContent;
- }
//通過GET獲取網頁原始碼string GetHtmlByGet(string url){ string strHtmlContent = ""; int sockfd; struct sockaddr_in addr; struct hostent *pURL; char text[RECVBUF]; //分析連結 UrlInfo urlInfo = ParseURL(url); string sAccept = "Accept: */*\r\nAccept-Language: zh-cn\r\nAccept-Encoding: gzip, deflate"; //不同的主機UserAgent不同 string sUserAgent = "Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/534.10"; //將埠轉換為字串 char t[6]; string strPort; sprintf(t,"%d", urlInfo.Port); strPort = t; //構造傳送字串 string strRequest = ""; strRequest.append("GET "); strRequest.append(urlInfo.File); strRequest.append("?"); strRequest.append(urlInfo.Body); strRequest.append(" HTTP/1.1\r\n"); strRequest.append(sAccept); strRequest.append("\r\nUser-Agent:"); strRequest.append(sUserAgent); strRequest.append("\r\nHost:"); strRequest.append(urlInfo.Host); strRequest.append(":"); strRequest.append(strPort); strRequest.append("\r\nConnection: Keep-Alive\r\n\r\n"); char* host = const_cast<char*>(urlInfo.Host.c_str()); sockfd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); //TCP方式傳送 pURL = gethostbyname(host); addr.sin_family = AF_INET; addr.sin_addr.s_addr = *((unsigned long*)pURL->h_addr); addr.sin_port = htons(80); //連線 connect(sockfd,(struct sockaddr *)&addr,sizeof(addr)); //傳送 send(sockfd, const_cast<char*>(strRequest.c_str()), strRequest.length(), 0); //接受 while(recv(sockfd, text, RECVBUF, 0) > 0) { strHtmlContent.append(text); bzero(text,RECVBUF); } //關閉socket close(sockfd); //返回接受結果 return strHtmlContent;}
使用libcurl
C程式碼
- #include <stdio.h>
- #include <string.h>
- #include <curl/curl.h>
#define MAX_BUF 65536

/* Receives the downloaded data; one extra byte is reserved for the NUL. */
char wr_buf[MAX_BUF+1];
/* Number of bytes currently stored in wr_buf. */
int wr_index;

/*
 * Write data callback function (called within the context of
 * curl_easy_perform).
 *
 * buffer       - chunk of received data
 * size, nmemb  - the chunk holds size * nmemb bytes
 * userp        - points at an int error flag, set to 1 when the chunk is
 *                rejected; may be NULL, in which case no flag is written
 *
 * Returns the number of bytes stored (size * nmemb), or 0 when the chunk
 * does not fit in the space left in wr_buf.  Nothing is copied in that case.
 */
size_t write_data( void *buffer, size_t size, size_t nmemb, void *userp )
{
  size_t room;
  size_t segsize;

  /* Compare by division so that size * nmemb can never overflow before it
   * is checked.  Also reject an out-of-range wr_index so the subtraction
   * below cannot go negative.
   */
  if ( wr_index < 0 || wr_index > MAX_BUF ) {
    room = 0;
  } else {
    room = (size_t)( MAX_BUF - wr_index );
  }

  /* Check to see if this data exceeds the size of our buffer. If so,
   * set the user-defined context value and return 0 to indicate a
   * problem to curl.
   */
  if ( size != 0 && nmemb > room / size ) {
    if ( userp != NULL ) {
      *(int *)userp = 1;
    }
    return 0;
  }

  segsize = size * nmemb;

  /* An empty chunk leaves the buffer untouched (buffer may be NULL then) */
  if ( segsize == 0 ) {
    return 0;
  }

  /* Copy the data from the curl buffer into our buffer */
  memcpy( &wr_buf[wr_index], buffer, segsize );

  /* Update the write index; segsize <= MAX_BUF here, so it fits in an int */
  wr_index += (int)segsize;

  /* Null terminate the buffer */
  wr_buf[wr_index] = 0;

  /* Return the number of bytes received, indicating to curl that all is okay */
  return segsize;
}
- /*
- * Simple curl application to read the index.html file from a Web site.
- */
- int main( void )
- {
- CURL *curl;
- CURLcode ret;
- int wr_error;
- wr_error = 0;
- wr_index = 0;
- /* First step, init curl */
- curl = curl_easy_init();
- if (!curl) {
- printf("couldn't init curl\n");
- return 0;
- }
- /* Tell curl the URL of the file we're going to retrieve */
- curl_easy_setopt( curl, CURLOPT_URL, "www.exampledomain.com" );
- /* Tell curl that we'll receive data to the function write_data, and
- * also provide it with a context pointer for our error return.
- */
- curl_easy_setopt( curl, CURLOPT_WRITEDATA, (void *)&wr_error );
- curl_easy_setopt( curl, CURLOPT_WRITEFUNCTION, write_data );
- /* Allow curl to perform the action */
- ret = curl_easy_perform( curl );
- printf( "ret = %d (write_error = %d)\n", ret, wr_error );
- /* Emit the page if curl indicates that no errors occurred */
- if ( ret == 0 ) printf( "%s\n", wr_buf );
- curl_easy_cleanup( curl );
- return 0;
- }
- http://yang7229693.iteye.com/blog/855208