網站採集的一個小程式
去年參加杭州馬拉松後,一直想知道參賽的具體人數,官網上滿足不了我這個要求,於是自己寫了個簡單的程式,可以做的功能:
1、下載網頁。
2、解析網頁,將參賽選手的名次、成績、姓名等資訊解析出來。
3、按照一定的規則排序。
待加強的部分:
1、異常處理。
2、下載網頁的時候,是可以設定編碼方式的,這樣處理漢字的時候更方便。同事告訴我的,待進一步測試驗證。
3、排序。可以寫個外部排序的程式,以支援大資料量的查詢。
makefile:
# Build the downloader/parser. The header is listed as a prerequisite so that
# editing downParseHtml.h triggers a rebuild. Recipe lines MUST start with a tab.
downParseHtml: downParseHtml.cc downParseHtml.h
	g++ -g -o downParseHtml downParseHtml.cc -lcurl

# clean is not a file; mark it phony so a stray file named "clean" cannot mask it.
.PHONY: clean
clean:
	rm -f downParseHtml
執行的指令碼:
#!/bin/sh
# Usage: ./downParseHtml <marker-text> [category]
#   1st argument: marker text; a line containing it means the next line is the result row.
#   2nd argument (optional): category keyword; only result rows containing it are parsed.
./downParseHtml 終點成績 男子半程
downParseHtml.h程式碼如下:
#ifndef _DOWNPARSEHTML_INCLUDE
#define _DOWNPARSEHTML_INCLUDE
#include <unistd.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/types.h>
#include <iostream>
#include "curl/curl.h"
#include <vector>
#include <string>
#include <map>
#include <dirent.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
using namespace std;
#define READ_BUF_SIZE 1024
/*
程式中需要進一步考慮出錯的情況
*/
/*
Basic utility library: integer <-> string conversion and string splitting.
*/
class cBasicLibrary
{
public:
cBasicLibrary();
~cBasicLibrary();
// Parses szSource as a decimal integer and returns it.
int str2i(const string &szSource);
// Splits szSrc on chSeparator and appends the pieces to vecDest.
// When iMaxNum > 0, splitting stops after iMaxNum pieces (the remainder is dropped).
// Returns the number of pieces appended; 0 for an empty szSrc.
int vSplitString(const string &szSrc ,vector<string>& vecDest , char chSeparator, int iMaxNum);
// Returns the decimal text of i.
string i2str(const int i);
// Writes i (0..9999) into sOutput as exactly five digits with leading zeros.
// Returns false and leaves sOutput empty when i is outside that range.
bool i2strEspecial(const int i, string &sOutput);
private:
// Scratch buffer used when formatting integers as decimal text;
// 10 bytes hold at most 9 characters plus the terminating NUL.
char m_cBuf[10];
};
/*
Page downloader: fetches one score page per player number and stores each page
as an .html file under the save path.
*/
class cDownHtml
{
public:
    cDownHtml();
    ~cDownHtml();
    // Downloads the pages for player numbers 0..99999.
    void getAllUrl();
private:
    // The object owns m_cBasicLibrary and deletes it in the destructor, so a
    // copy would delete the same pointer twice. Copying is therefore disabled
    // (declared private and intentionally left undefined).
    cDownHtml(const cDownHtml &);
    cDownHtml &operator=(const cDownHtml &);

    string m_szUrl;                  // base URL; the player number is appended to it
    cBasicLibrary *m_cBasicLibrary;  // owned helper for number formatting
    string m_szIpAndPort;            // "ip:port" handed to libcurl as CURLOPT_PROXY
    string m_szSavePath;             // root directory for the downloaded pages
    // Downloads the page of player iNumOfPlayer into the file `filename`.
    bool getUrl(const char *filename, const int iNumOfPlayer);
    // Downloads player numbers >= 10000 in the inclusive range [iStart, iEnd].
    void GetRulFromServiceBatch(const int iStart, const int iEnd);
    // Downloads player numbers 0..9999 in the inclusive range [iStart, iEnd].
    bool GetRulFromServiceBatchSmallNum(const int iStart, const int iEnd);
};
/*
Sorting: the per-player record that is collected and then printed in ranking order.
*/
typedef struct ST_Player_Info
{
string szRanking; // ranking
string szNum; // bib number
string szName; // name
string szAddress; // contact info - address
string szType; // race category
string szGrade; // result
}Player_InfoT;
// Holds the parsed player records and prints them in ascending key order.
class cCompositor
{
public:
cCompositor();
~cCompositor();
// Prints the number of records, then one comma-separated line per record.
void printfResult();
// All parsed records, keyed by the ranking converted to int.
map<int,Player_InfoT> m_AllPayerData;
// Scratch record that the parser fills in before a copy is inserted into the map.
Player_InfoT m_Player_InfoT;
private:
};
/*
Result parser: walks the directory tree of downloaded pages, extracts the
result row from each page and stores it in the compositor.
*/
class cParseHtml
{
public:
    cParseHtml();
    ~cParseHtml();
    // Recursively scans directory szFileName and parses every regular file in it.
    // Returns false when the name is empty or the directory cannot be opened.
    bool OpenDir(const string &szFileName);
    // Owned collection of parsed records; deleted in the destructor.
    cCompositor *m_cCompositor;
private:
    // The object owns both pointers and deletes them in the destructor, so a
    // copy would delete the same pointers twice. Copying is therefore disabled
    // (declared private and intentionally left undefined).
    cParseHtml(const cParseHtml &);
    cParseHtml &operator=(const cParseHtml &);

    // True when szPathAndName exists and is a directory.
    bool IsDIR(const string &szPathAndName);
    // Extracts the player fields from one HTML result row.
    void ParseKeyLine(const string &szKeyLine);
    // Finds the result row inside one downloaded page and parses it.
    void ParseGetHtmlFile(const string &szFilePathAndName);
private:
    cBasicLibrary *m_cBasicLibrary;  // owned helper for conversions
};
#endif
downParseHtml.cc程式碼如下:
#include "downParseHtml.h"
using namespace std;
// Race category (argv[2]); when non-empty, only result rows containing it are parsed.
string gszKeyword; 
// Marker text (argv[1]) matched during parsing to confirm a page is valid;
// the line after the one containing it is treated as the result row.
string gszName; 
// Bib number of the player currently being processed (taken from the file name).
string gszPlayerNum; 
// Base URL of a player's score page; the player number is appended to it.
#define DOWNLOADURL "http://www.hzim.org/sign/score.php?type=csh&idnum="
// "ip:port" passed to libcurl as the proxy for every download.
#define DOWNLOADIPANDPORT "122.225.106.163:80"
// Root directory where pages are saved and from which they are parsed.
#define HTMLSAVEPATH "/data/tom/"
// Starts with an all-zero scratch buffer.
cBasicLibrary::cBasicLibrary()
{
    for (size_t k = 0; k < sizeof(m_cBuf); ++k)
        m_cBuf[k] = '\0';
}

// Nothing to release.
cBasicLibrary::~cBasicLibrary()
{
}
/*
Parses szSource as a base-10 integer.
Leading whitespace is skipped and parsing stops at the first non-digit;
text that does not start with a number yields 0.

The previous version kept the result in a function-local static (shared
between all callers, so not reentrant) and used atol, whose behavior is
undefined when the value is out of range. strtol has defined behavior there.
*/
int cBasicLibrary::str2i(const string &szSource)
{
    return static_cast<int>(strtol(szSource.c_str(), NULL, 10));
}
/*
aaa,bbb ->分成aaa 和bbb
*/
int cBasicLibrary::vSplitString(const string &szSrc ,vector<string>& vecDest,
char chSeparator, int iMaxNum)
{
if(szSrc.empty())
return 0;
int nTotalSplit = 0;
string::size_type size_pos = 0;
string::size_type size_prev_pos = 0;
while ((size_pos = szSrc.find_first_of(chSeparator, size_pos)) != string::npos)
{
string strTemp = szSrc.substr(size_prev_pos , size_pos-size_prev_pos);
vecDest.push_back(strTemp);
nTotalSplit++;
size_prev_pos = ++size_pos;
if(iMaxNum > 0 && nTotalSplit >= iMaxNum)
return nTotalSplit;
}
string strTemp1 = szSrc.substr(size_prev_pos , szSrc.length() - size_prev_pos );
vecDest.push_back(strTemp1);
nTotalSplit++;
return nTotalSplit;
};
/*
Returns the decimal text of i.

The previous version used sprintf into the 10-byte member buffer, which
overflows for any value needing 10 or more characters (e.g. 1000000000 or
-100000000). A local buffer large enough for every int is used instead, and
snprintf bounds the write.
*/
string cBasicLibrary::i2str(const int i)
{
    char digits[32];
    const int written = snprintf(digits, sizeof(digits), "%d", i);
    if (written < 0)
        return string();   // formatting failed; report as empty text
    return string(digits);
}
bool cBasicLibrary::i2strEspecial(const int i, string &sOutput)
{
int iReturnFlag = -1;
sOutput.clear();
if ((i >= 0) && (i <= 9))
{
sOutput = "0000";
}
else if ((i >= 10) && (i <= 99))
{
sOutput = "000";
}
else if ((i >= 100) && (i <= 999))
{
sOutput = "00";
}
else if ((i >= 1000) && (i <= 9999))
{
sOutput = "0";
}
else
{
iReturnFlag = 0;
}
if (iReturnFlag == 0)
return false;
else
{
memset(m_cBuf,'\0',sizeof(m_cBuf));
sprintf(m_cBuf, "%d", i);
sOutput += string(m_cBuf);
return true;
}
}
// Sets up the download URL, proxy address and save path from the file-level
// constants and allocates the formatting helper.
cDownHtml::cDownHtml()
    : m_szUrl(DOWNLOADURL),
      m_cBasicLibrary(new cBasicLibrary()),
      m_szIpAndPort(DOWNLOADIPANDPORT),
      m_szSavePath(HTMLSAVEPATH)
{
}

// Releases the formatting helper (deleting a null pointer is a no-op).
cDownHtml::~cDownHtml()
{
    delete m_cBasicLibrary;
}
bool cDownHtml::getUrl(const char *filename, const int iNumOfPlayer)
{
if ((iNumOfPlayer < 0)
|| (filename == NULL))
return false;
FILE *fp;
if ((fp = fopen(filename, "w")) == NULL)
return false;
struct curl_slist *headers = NULL;
string szUrl;
CURL *curl;
CURLcode res;
if (iNumOfPlayer >= 10000)
{
szUrl = m_szUrl + m_cBasicLibrary->i2str(iNumOfPlayer);
}
else
{
string szNum;
m_cBasicLibrary->i2strEspecial(iNumOfPlayer,szNum);
szUrl = m_szUrl + szNum;
}
headers = curl_slist_append(headers,"Accept: Agent-007");
curl = curl_easy_init();
if (curl)
{
curl_easy_setopt(curl, CURLOPT_PROXY, m_szIpAndPort.c_str());
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
curl_easy_setopt(curl, CURLOPT_URL, szUrl.c_str());
curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp);
res = curl_easy_perform(curl);
if (res != 0)
cout<<"res is "<<res<<endl;
curl_slist_free_all(headers);
curl_easy_cleanup(curl);
}
fclose(fp);
return true;
}
void cDownHtml::GetRulFromServiceBatch(const int iStart, const int iEnd)
{
int i,itmp;
string sFileName,szShell,szPathName;
string sFilePath = m_szSavePath;
if (iStart < 10000)
return;
for (i = iStart; i <= iEnd; i++)
{
itmp = i / 10000;
szPathName = m_szSavePath + m_cBasicLibrary->i2str(itmp) + "/";
if (0 == (i % 200))
{
sFilePath = szPathName;
sFilePath += m_cBasicLibrary->i2str(i);
szShell = "mkdir " + sFilePath;
system(szShell.c_str());
sleep(3);
}
sFileName = sFilePath + "/" + m_cBasicLibrary->i2str(i) + ".html";
if (!getUrl(sFileName.c_str(), i))
cout<<"sFileName is "<<sFileName<<" get url err!"<<endl;
}
return;
}
bool cDownHtml::GetRulFromServiceBatchSmallNum(const int iStart, const int iEnd)
{
if ((iStart > 9999)
|| (iStart < 0)
|| (iEnd > 9999))
{
return false;
}
int i = 0;
string sFileName,szShell,sNum;
string sFilePath = m_szSavePath;
//0-9999
sFilePath = m_szSavePath + "/0/";
for (i = iStart; i <= iEnd; i++)
{
if (0 == (i % 200))
{
sFilePath = m_szSavePath + "/0/";
m_cBasicLibrary->i2strEspecial(i,sNum);
sFilePath += sNum;
szShell = "mkdir " + sFilePath;
system(szShell.c_str());
sleep(3);
}
m_cBasicLibrary->i2strEspecial(i,sNum);
sFileName = sFilePath + "/" + sNum + ".html";
if (!getUrl(sFileName.c_str(), i))
cout<<"sFileName is "<<sFileName<<" get url err!"<<endl;
}
return true;
}
// Downloads every player page. Numbers below 10000 are handled separately
// from the five-digit numbers because they need zero-padding.
void cDownHtml::getAllUrl()
{
    const int kFirstFiveDigit = 10000;
    GetRulFromServiceBatchSmallNum(0, kFirstFiveDigit - 1);
    GetRulFromServiceBatch(kFirstFiveDigit, 99999);
}
// Allocates the two owned helpers: the conversion library first, then the
// record collection.
cParseHtml::cParseHtml()
{
    m_cBasicLibrary = new cBasicLibrary();
    m_cCompositor = new cCompositor();
}

// Releases both helpers in the order they were created
// (deleting a null pointer is a no-op).
cParseHtml::~cParseHtml()
{
    delete m_cBasicLibrary;
    delete m_cCompositor;
}
//string szKeyLine = "</tr><tr><th>彭輝 </th><td>湖南 湘西土家族苗族自治州 </td><td>2154</td><td>男子全程</td><td>05:06:28</td></tr></table>";
void cParseHtml::ParseKeyLine(const string &szKeyLine)
{
if (szKeyLine.size() <= 0)
return;
vector<string> vPlayerNum;
m_cBasicLibrary->vSplitString(gszPlayerNum , vPlayerNum, '.', 0);
if (vPlayerNum.size() > 0)
{
gszPlayerNum = vPlayerNum[0];
}
else
gszPlayerNum.clear();
vector<string> vMessage;
int i = 0, j = 0;
string szOutput;
for (i = 0; i < szKeyLine.size(); i++)
{
if ((*(szKeyLine.c_str() + i)) == '<')
{
for (j = i + 1; j < szKeyLine.size(); j++)
{
if ((*(szKeyLine.c_str() + j)) == '>')
{
i = j;
if (szOutput.size() > 0)
{
vMessage.push_back(szOutput);
szOutput.clear();
}
break;
}
}
if (j >= szKeyLine.size())
break;
}
else
{
szOutput += *(szKeyLine.c_str() + i);
}
}
if (vMessage.size() > 3)
{
/*
cout<<vMessage[3]<<","<<gszPlayerNum<<",";
//i = 0時候為空
for(i = 1; i < vMessage.size(); i++)
{
if (i == 3)
continue;
if (i == vMessage.size() - 1)
cout<<vMessage[i];
else
cout<<vMessage[i]<<",";
}
cout<<endl;
*/
m_cCompositor->m_Player_InfoT.szNum = gszPlayerNum;
m_cCompositor->m_Player_InfoT.szAddress = vMessage[2];
m_cCompositor->m_Player_InfoT.szGrade = vMessage[5];
m_cCompositor->m_Player_InfoT.szName = vMessage[1];
m_cCompositor->m_Player_InfoT.szRanking = vMessage[3];
m_cCompositor->m_Player_InfoT.szType = vMessage[4];
m_cCompositor->m_AllPayerData.insert(map<int,Player_InfoT>::value_type(m_cBasicLibrary->str2i(vMessage[3]), m_cCompositor->m_Player_InfoT));
}
return;
}
void cParseHtml::ParseGetHtmlFile(const string &szFilePathAndName)
{
if (szFilePathAndName.size() <= 0)
return;
int i;
FILE *status;
char buffer[READ_BUF_SIZE];
string szOneLineFromFile;
string::size_type position;
if (!(status = fopen(szFilePathAndName.c_str(), "r")))
{
fclose(status);
return;
}
while (fgets(buffer, READ_BUF_SIZE-1, status) != NULL)
{
szOneLineFromFile = string(buffer);
position = szOneLineFromFile.find(gszName.c_str());
if (position != szOneLineFromFile.npos)
{
if (fgets(buffer, READ_BUF_SIZE-1, status) != NULL)
{
szOneLineFromFile = string(buffer);
if (gszKeyword.size() > 0)
{
position = szOneLineFromFile.find(gszKeyword.c_str());
if (position != szOneLineFromFile.npos)
{
ParseKeyLine(szOneLineFromFile);
}
}
else
{
ParseKeyLine(szOneLineFromFile);
}
}
else
break;
}
memset(buffer,'\0', sizeof(buffer));
}
fclose(status);
return;
}
// True when szPathAndName can be stat()ed and refers to a directory;
// false when stat() fails or the path is something else.
bool cParseHtml::IsDIR(const string &szPathAndName)
{
    struct stat info;
    const bool bStatOk = (stat(szPathAndName.c_str(), &info) != -1);
    return bStatOk && S_ISDIR(info.st_mode);
}
bool cParseHtml::OpenDir(const string &szFileName)
{
if (szFileName.size() <= 0)
return false;
DIR *dp;
struct dirent *dirp;
string FileName;
if((dp = opendir(szFileName.c_str())) == NULL)
{
perror("opendir error");
return false;
}
while ((dirp = readdir(dp))!=NULL)
{
if((strcmp(dirp->d_name,".")==0)||(strcmp(dirp->d_name,"..")==0))
continue;
/*
判斷是否是資料夾,但是在nfs系統下,
dirp->d_type 的值讀不出來,需要用其它的方法。
*/
//if (dirp->d_type == DT_DIR)
FileName = szFileName;
FileName += string(dirp->d_name);
gszPlayerNum = string(dirp->d_name);
if (IsDIR(FileName))
{
string szFilePathAndName = FileName;
szFilePathAndName += "/";
OpenDir(szFilePathAndName);//遞迴的過程
}
else //檔案
{
ParseGetHtmlFile(FileName);
}
}
closedir(dp);
return true;
}
// Starts with an empty record map and an all-empty scratch record.
cCompositor::cCompositor()
    : m_AllPayerData(),
      m_Player_InfoT()
{
}

// Nothing to release.
cCompositor::~cCompositor()
{
}
void cCompositor::printfResult()
{
cout<<"m_AllPayerData.size is "<<m_AllPayerData.size()<<endl;
map<int,Player_InfoT>::iterator iter = m_AllPayerData.begin();
while (m_AllPayerData.end() != iter)
{
cout<<iter->second.szRanking
<<","<<iter->second.szName
<<","<<iter->second.szGrade
<<","<<iter->second.szNum
<<","<<iter->second.szAddress
//<<","<<iter->second.szType
<<endl;
++iter;
}
return;
}
int main(int argc, char **argv)
{
if (argc < 2)
{
cout<<"argc is "<<argc<<" parameter num is too few!"<<endl;
return -1;
}
gszName = argv[1];
if (argc == 3)
gszKeyword = argv[2];
cParseHtml *ocParseHtml = new cParseHtml();
if (ocParseHtml != NULL)
{
ocParseHtml->OpenDir(HTMLSAVEPATH);
ocParseHtml->m_cCompositor->printfResult();
delete ocParseHtml;
}
return 0;
}