VC6.0 使用mshtml解析html
測試用例
<html><head><title>
Just a Test
</title></head><body>
gaofeng hello!!
<div><table bgcolor="red"><tr><td bgcolor="yellow" border="2">Name</td><td id="qualify1" border="1" class="blueBorder" bgcolor=blue></td></tr><
標頭檔案:
#include <iostream>
#include <
#include <mshtml.h>
#include <string>
#include <fstream>
#include <vector>
#include <map>
#import <mshtml.tlb> no_auto_exclude
程式碼:
// TestMSHTML.cpp : 定義控制檯應用程式的入口點。
//
#include "stdafx.h"
#include "TestMSHTML.h"
#ifdef _DEBUG
#define new DEBUG_NEW#endif// 唯一的應用程式物件
CWinApp theApp;
FILE * fout;
usingnamespace std;
//OLECHAR szHTML[] = OLESTR("<HTML><BODY>Hello World!</BODY></HTML>");
typedef int BorderAttribute;
// Stub: fetches the document body and releases it again.
// borderValue2ElementMap is not modified; no traversal is implemented yet.
//
// pNewDoc                 document to inspect; a NULL pointer is ignored.
// borderValue2ElementMap  output map (currently left untouched).
void FindAllElementHavingBg(IHTMLDocument2 * pNewDoc,map<BorderAttribute,IHTMLElement *>& borderValue2ElementMap)
{
    if (pNewDoc == NULL)
    {
        return;
    }
    // Start from NULL so a failed get_body never leaves a garbage pointer
    // that would then be released.
    IHTMLElement * pBody = NULL;
    if (SUCCEEDED(pNewDoc->get_body(&pBody)) && pBody != NULL)
    {
        pBody->Release();
    }
}
void PrintTabs(int n)
{
for (int i =0;i<n;i++)
{
//cout << '\t'; fwprintf(fout,_T("\t"));
}
}
void VisitNode(IHTMLElement* pElement,int level)
{
BSTR strName,strId,strTag;
PrintTabs(level);
pElement->get_className(&strName);
pElement->get_id(&strId);
pElement->get_tagName(&strTag);
if (strTag!=NULL)
{
fwprintf(fout,_T("TagName:%s "),strTag);
}
if (strName!=NULL)
{
fwprintf(fout,_T("className:%s "),strName);
}
if (strId != NULL)
{
fwprintf(fout,_T("Id:%s "),strId);
}
SysFreeString(strName);
SysFreeString(strId);
SysFreeString(strTag);
BSTR strAttrName1 = _T("border");
BSTR strAttrName2 = _T("bgcolor");
VARIANT val;
pElement->getAttribute(strAttrName1,2,&val);
if (val.vt != VT_NULL)
{
if (val.bstrVal != NULL)
{
fwprintf(fout,_T("border:%s "),val.bstrVal);
}
}
pElement->getAttribute(strAttrName2,2,&val);
if (val.vt != VT_NULL)
{
if (val.bstrVal != NULL)
{
fwprintf(fout,_T("bgcolor:%s "),val.bstrVal);
}
}
fwprintf(fout,_T("\n"));
}
//將DOM樹打印出來void Run(IHTMLElement * pElement,int level)
{
IHTMLElementCollection * children;
VisitNode(pElement,level);
IDispatch* pDisp;
pElement->get_children(&pDisp);
pDisp->QueryInterface(IID_IHTMLElementCollection,(void**)&children);
pDisp->Release();
long len;
children->get_length(&len);
VARIANT dummy;
dummy.vt = VT_I4;
for (int i =0;i < len;i++)
{
IHTMLElement* child;
dummy.intVal = i;
children->item(dummy,dummy,(IDispatch**)&pDisp);
pDisp->QueryInterface(IID_IHTMLElement,(void**)&child);
pDisp->Release();
Run(child,level +1);
child->Release();
}
children->Release();
}
// Demonstrates reading a parsed document: prints the body's innerText and the
// document title to stdout, then dumps the element tree under <body> through
// Run (which writes to the global file `fout`).
//
// pNewDoc  parsed document; NULL is ignored.
void TestParse(IHTMLDocument2 * pNewDoc)
{
    if (pNewDoc == NULL)
    {
        return;
    }

    IHTMLElement *pBody = NULL;
    if (FAILED(pNewDoc->get_body(&pBody)))
    {
        pBody = NULL;
    }

    BSTR strText = NULL;
    if (pBody != NULL)
    {
        pBody->get_innerText(&strText);
        // A NULL BSTR is an empty string; never hand NULL to %s.
        wprintf(L"%s\n", strText != NULL ? strText : L"");
        SysFreeString(strText);
        strText = NULL;
    }

    pNewDoc->get_title(&strText);
    wprintf(L"%s\n", strText != NULL ? strText : L"");
    SysFreeString(strText);

    cout <<"Run begin...."<<endl;
    if (pBody != NULL)
    {
        Run(pBody,0);
    }
    cout <<"Run end...."<<endl;

    if (pBody != NULL)
    {
        pBody->Release();
    }
    //FindAllElementHavingBg(pNewDoc);
}
void TestMSHTML(wchar_t * wcontent)
{
IHTMLDocument2 *pDoc = NULL;
CoInitialize(NULL);
CoCreateInstance(CLSID_HTMLDocument,
NULL,
CLSCTX_INPROC_SERVER,
IID_IHTMLDocument2,
(LPVOID *) &pDoc);
if (pDoc)
{
IPersistStreamInit *pPersist = NULL;
pDoc->QueryInterface(IID_IPersistStreamInit,
(LPVOID *) &pPersist);
if (pPersist)
{
IMarkupServices *pMS = NULL;
pPersist->InitNew();
pPersist->Release();
pDoc->QueryInterface(IID_IMarkupServices,
(LPVOID *) &pMS);
if (pMS)
{
IMarkupContainer *pMC = NULL;
IMarkupPointer *pMkStart = NULL;
IMarkupPointer *pMkFinish = NULL;
pMS->CreateMarkupPointer(&pMkStart);
pMS->CreateMarkupPointer(&pMkFinish);
pMS->ParseString(wcontent,
0,
&pMC,
pMkStart,
pMkFinish);
if (pMC)
{
IHTMLDocument2 *pNewDoc = NULL;
pMC->QueryInterface(IID_IHTMLDocument,
(LPVOID *) &pNewDoc);
if (pNewDoc)
{
// do anything with pNewDoc, in this case
// get the body innerText. TestParse(pNewDoc);
pNewDoc->Release();
}
pMC->Release();
}
if (pMkStart)
pMkStart->Release();
if (pMkFinish)
pMkFinish->Release();
pMS->Release();
}
}
pDoc->Release();
}
CoUninitialize();
}
inline wchar_t* AnsiToUnicode( constchar* szStr )
{
int nLen = MultiByteToWideChar( CP_ACP, MB_PRECOMPOSED, szStr, -1, NULL, 0 );
if (nLen ==0)
{
return NULL;
}
wchar_t* pResult =new wchar_t[nLen+1];
MultiByteToWideChar( CP_ACP, MB_PRECOMPOSED, szStr, -1, pResult, nLen );
pResult[nLen] = L'\0';
return pResult;
}
//呼叫者負責delete wcontentwchar_t * ReadFromHtmlFile(string str,string& content)
{
ifstream fin(str.c_str());
string line;
while(getline(fin,line))
{
// cout << line << endl; content = content + line;
}
//cout << content << endl;
//cout << content.size() << endl;
//printf("original html code\n%s\n",content.c_str()); wchar_t * wcontent = AnsiToUnicode(content.c_str());
//wprintf(L"after transferred\n%s\n",wcontent);
//delete[] wcontent; fin.close();
fin.clear();
return wcontent;
}
int _tmain(int argc, TCHAR* argv[], TCHAR* envp[])
{
int nRetCode =0;
// 初始化 MFC 並在失敗時顯示錯誤if (!AfxWinInit(::GetModuleHandle(NULL), NULL, ::GetCommandLine(), 0))
{
// TODO: 更改錯誤程式碼以符合您的需要 _tprintf(_T("錯誤: MFC 初始化失敗\n"));
nRetCode =1;
}
else
{
fout = fopen("out.txt","w");
string str ="test.html";
string content;
wchar_t * wcontent = ReadFromHtmlFile(str,content);
int len = wcslen(wcontent);
//cout << len << endl;
TestMSHTML(wcontent);
delete[] wcontent;
fclose(fout);
}
return nRetCode;
}
輸出結果:
TagName:BODY
TagName:DIV
TagName:TABLE bgcolor:#ff0000
TagName:TBODY
TagName:TR
TagName:TD border:2 bgcolor:#ffff00
TagName:TD className:blueBorder Id:qualify1 border:1 bgcolor:#0000ff
TagName:TR
TagName:TD
TagName:P className:blueBorder Id:qualify2 border:1 bgcolor:blue
TagName:TD
TagName:TR
TagName:TD
TagName:TD
電腦使用WIN7,根據以上程式碼我修改出了自己的程式碼,但是在vc6中編譯時提示: IID_IMarkupServices未定義,於是開啟標頭檔案:C:\Program Files\Microsoft Visual Studio\VC98\Include\MSHTML.H (vc6中存放位置)與vs2008的標頭檔案進行對比,發現vc6中IID_IMarkupServices未定義,再開啟OLE/COM Object Viewer(Microsoft Visual Studio 6.0->Microsoft Visual Studio 6.0 Tools->OLE Tools)->Type Libraries,找到Microsoft HTML Object Library(Ver 4.0), 搜尋IMarkupServices2 找到如下內容:
[
odl,
uuid(3050F682-98B5-11CF-BB82-00AA00BDCE0B)
]
interface IMarkupServices2 : IMarkupServices {
HRESULT _stdcall ParseGlobalEx(
[in] wireHGLOBAL hglobalHTML,
[in] unsigned long dwFlags,
[in] IMarkupContainer* pContext,
[out] IMarkupContainer** ppContainerResult,
[in] IMarkupPointer* pPointerStart,
[in] IMarkupPointer* pPointerFinish);
HRESULT _stdcall ValidateElements(
[in] IMarkupPointer* pPointerStart,
[in] IMarkupPointer* pPointerFinish,
[in] IMarkupPointer* pPointerTarget,
[in, out] IMarkupPointer* pPointerStatus,
[out] IHTMLElement** ppElemFailBottom,
[out] IHTMLElement** ppElemFailTop);
HRESULT _stdcall SaveSegmentsToClipboard(
[in] ISegmentList* pSegmentList,
[in] unsigned long dwFlags);
};
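說明在 WIN7 下 IMarkupServices2 的介面識別碼(IID/GUID)是 3050F682-98B5-11CF-BB82-00AA00BDCE0B(注意:這是 IMarkupServices2 的 IID;由於 IMarkupServices2 繼承自 IMarkupServices,用它查詢到的介面仍可當作 IMarkupServices 使用),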
所以在我的檔案裡面添加了如下程式碼:
// Supplies the IID_IMarkupServices symbol that the VC6 MSHTML.H lacks.
// The value is the uuid shown for IMarkupServices2 in the type-library listing
// above; that interface is declared there as deriving from IMarkupServices.
// NOTE(review): newer SDK headers are believed to give IMarkupServices its own
// IID (3050F4A0-98B5-11CF-BB82-00AA00BDCE0B) - confirm before reusing this.
// __declspec(selectany) lets the definition appear in more than one object file.
extern "C" const GUID __declspec(selectany) IID_IMarkupServices =
{0x3050F682,0x98B5,0x11CF,{0xBB,0x82,0x00,0xAA,0x00,0xBD,0xCE,0x0B}};
再次編譯就OK了;
解析的時候請注意:要先把讀入的網頁內容轉換為 wchar_t 型別的字串,再交給 ParseString。