1. 程式人生 > >VC6.0 使用mshtml解析html

VC6.0 使用mshtml解析html

測試用例

<html><head><title>
    Just a Test
</title></head><body>
gaofeng hello!!
<div><table bgcolor="red"><tr><td bgcolor="yellow" border="2">Name</td><td id="qualify1" border="1" class="blueBorder" bgcolor=blue></td></tr><tr><td><p id="qualify2" class="blueBorder" bgcolor="blue" border="1">Surname</p></td><td></td></tr><tr><td>address</td><td></td></tr></table></div></body></html>

標頭檔案:

#include <iostream>
#include 
<

comdef.h>
#include 
<mshtml.h>
#include 
<string>
#include 
<fstream>
#include 
<vector>
#include 
<map>
#import 
<mshtml.tlb> no_auto_exclude

程式碼:

// TestMSHTML.cpp : 定義控制檯應用程式的入口點。
//

#include 
"stdafx.h"
#include 
"TestMSHTML.h"
#ifdef _DEBUG
#define new DEBUG_NEW#endif// 唯一的應用程式物件


CWinApp theApp;
FILE 
* fout;
usingnamespace std;
//OLECHAR szHTML[] = OLESTR("<HTML><BODY>Hello World!</BODY></HTML>");
typedef  
int BorderAttribute;
void FindAllElementHavingBg(IHTMLDocument2 * pNewDoc,map<BorderAttribute,IHTMLElement *>& borderValue2ElementMap)
{
    IHTMLElement 
* pBody;
    pNewDoc
->get_body(&pBody);
    pBody
->Release();
}

void PrintTabs(int n)
{
    
for (int i =0;i<n;i++)
    {
        
//cout << '\t';        fwprintf(fout,_T("\t"));
    }
}

void VisitNode(IHTMLElement* pElement,int level)
{
    BSTR strName,strId,strTag;
    PrintTabs(level);
    pElement
->get_className(&strName);
    pElement
->get_id(&strId);
    pElement
->get_tagName(&strTag);
    
if (strTag!=NULL)
    {
        fwprintf(fout,_T(
"TagName:%s "),strTag);
    }
    
if (strName!=NULL)
    {
        fwprintf(fout,_T(
"className:%s "),strName);
    }
    
if (strId != NULL)
    {
        fwprintf(fout,_T(
"Id:%s "),strId);
    }
    SysFreeString(strName);
    SysFreeString(strId);
    SysFreeString(strTag);
    BSTR strAttrName1 
= _T("border");
    BSTR strAttrName2 
= _T("bgcolor");
    VARIANT val;

    pElement
->getAttribute(strAttrName1,2,&val);
    
if (val.vt != VT_NULL)
    {
        
if (val.bstrVal != NULL)
        {
            fwprintf(fout,_T(
"border:%s "),val.bstrVal);
        }
    }


    pElement
->getAttribute(strAttrName2,2,&val);
    
if (val.vt != VT_NULL)
    {
        
if (val.bstrVal != NULL)
        {
            fwprintf(fout,_T(
"bgcolor:%s "),val.bstrVal);
        }
    }

    
    fwprintf(fout,_T(
"\n"));
}
//將DOM樹打印出來void Run(IHTMLElement * pElement,int level)
{
    IHTMLElementCollection 
* children;

    VisitNode(pElement,level);


    IDispatch
* pDisp;
    pElement
->get_children(&pDisp);
    pDisp
->QueryInterface(IID_IHTMLElementCollection,(void**)&children);
    pDisp
->Release();

    
long len;
    children
->get_length(&len);
    VARIANT dummy;
    dummy.vt 
= VT_I4;
    
for (int i =0;i < len;i++)
    {
        IHTMLElement
* child;
        dummy.intVal 
= i;
        children
->item(dummy,dummy,(IDispatch**)&pDisp);
        pDisp
->QueryInterface(IID_IHTMLElement,(void**)&child);
        pDisp
->Release();
        Run(child,level 
+1);
        child
->Release();
    }
    children
->Release();
}
void TestParse(IHTMLDocument2 * pNewDoc)
{
    BSTR strText;
    IHTMLElement 
*pBody;
    pNewDoc
->get_body(&pBody);
    pBody
->get_innerText(&strText);
    wprintf(_T(
"%s\n"),strText);
    SysFreeString(strText);
    

    pNewDoc
->get_title(&strText);
    wprintf(_T(
"%s\n"),strText);
    SysFreeString(strText);
    
    cout 
<<"Run begin...."<<endl;
    Run(pBody,
0);
    cout 
<<"Run end...."<<endl;

    pBody
->Release();

    
//FindAllElementHavingBg(pNewDoc);
}
void TestMSHTML(wchar_t * wcontent)
{
    IHTMLDocument2 
*pDoc = NULL;
    CoInitialize(NULL);
    CoCreateInstance(CLSID_HTMLDocument, 
                     NULL, 
                     CLSCTX_INPROC_SERVER, 
                     IID_IHTMLDocument2, 
                    (LPVOID 
*&pDoc);

    
if (pDoc)
    {
        IPersistStreamInit 
*pPersist = NULL;
        pDoc
->QueryInterface(IID_IPersistStreamInit, 
                             (LPVOID 
*&pPersist);
        
if (pPersist)
        {
            IMarkupServices 
*pMS = NULL;
            pPersist
->InitNew();
            pPersist
->Release();
            pDoc
->QueryInterface(IID_IMarkupServices, 
                                (LPVOID 
*&pMS);

            
if (pMS)
            {
                IMarkupContainer 
*pMC = NULL;
                IMarkupPointer 
*pMkStart = NULL;
                IMarkupPointer 
*pMkFinish = NULL;
                pMS
->CreateMarkupPointer(&pMkStart);
                pMS
->CreateMarkupPointer(&pMkFinish);
                pMS
->ParseString(wcontent,
                    
0
                    
&pMC, 
                    pMkStart, 
                    pMkFinish);

                
if (pMC)
                {
                    IHTMLDocument2 
*pNewDoc = NULL;

                    pMC
->QueryInterface(IID_IHTMLDocument, 
                        (LPVOID 
*&pNewDoc);

                    
if (pNewDoc)
                    {
                        
// do anything with pNewDoc, in this case 
                        
// get the body innerText.                        TestParse(pNewDoc);
    
                        pNewDoc
->Release();
                    }

                    pMC
->Release();
                }

                
if (pMkStart)
                    pMkStart
->Release();

                
if (pMkFinish)
                    pMkFinish
->Release();

                pMS
->Release();
            }
        }

        pDoc
->Release();
    }

    CoUninitialize();

}

inline wchar_t
* AnsiToUnicode( constchar* szStr )
{
    
int nLen = MultiByteToWideChar( CP_ACP, MB_PRECOMPOSED, szStr, -1, NULL, 0 );
    
if (nLen ==0)
    {
        
return NULL;
    }
    wchar_t
* pResult =new wchar_t[nLen+1];
    MultiByteToWideChar( CP_ACP, MB_PRECOMPOSED, szStr, 
-1, pResult, nLen );
    pResult[nLen] 
= L'\0';
    
return pResult;
}

//呼叫者負責delete wcontentwchar_t * ReadFromHtmlFile(string str,string& content)
{
    ifstream fin(str.c_str());
    
string line;
    
while(getline(fin,line))
    {
    
//    cout << line << endl;        content = content + line;
    }
    
//cout << content << endl;
    
//cout << content.size() << endl;
    
//printf("original html code\n%s\n",content.c_str());    wchar_t * wcontent = AnsiToUnicode(content.c_str()); 
    
//wprintf(L"after transferred\n%s\n",wcontent);
    
//delete[] wcontent;    fin.close();
    fin.clear();
    
return wcontent;
}

int _tmain(int argc, TCHAR* argv[], TCHAR* envp[])
{
    
int nRetCode =0;

    
// 初始化 MFC 並在失敗時顯示錯誤if (!AfxWinInit(::GetModuleHandle(NULL), NULL, ::GetCommandLine(), 0))
    {
        
// TODO: 更改錯誤程式碼以符合您的需要        _tprintf(_T("錯誤: MFC 初始化失敗\n"));
        nRetCode 
=1;
    }
    
else
    {
        fout 
= fopen("out.txt","w");
        
string str ="test.html";
        
string content;
        wchar_t 
* wcontent = ReadFromHtmlFile(str,content);
        
int len = wcslen(wcontent);
        
//cout << len << endl;        
        TestMSHTML(wcontent);
        delete[] wcontent;
        fclose(fout);
    }
    
    
return nRetCode;
}

輸出結果:

TagName:BODY
 TagName:DIV
  TagName:TABLE bgcolor:#ff0000
   TagName:TBODY
    TagName:TR
     TagName:TD border:2 bgcolor:#ffff00
     TagName:TD className:blueBorder Id:qualify1 border:1 bgcolor:#0000ff
    TagName:TR
     TagName:TD
      TagName:P className:blueBorder Id:qualify2 border:1 bgcolor:blue
     TagName:TD
    TagName:TR
     TagName:TD
     TagName:TD

電腦使用WIN7,根據以上程式碼我修改出了自己的程式碼,但是在vc6中編譯時提示: IID_IMarkupServices未定義,於是開啟標頭檔案:C:\Program Files\Microsoft Visual Studio\VC98\Include\MSHTML.H (vc6中存放位置)與vs2008的標頭檔案進行對比,發現vc6中IID_IMarkupServices未定義,再開啟OLE/COM Object Viewer(Microsoft Visual Studio 6.0->Microsoft Visual Studio 6.0 Tools->OLE Tools)->Type Libraries,找到Microsoft HTML Object Library(Ver 4.0), 搜尋IMarkupServices2 找到如下內容:

 [
      odl,
      uuid(3050F682-98B5-11CF-BB82-00AA00BDCE0B)
    ]
    interface IMarkupServices2 : IMarkupServices {
        HRESULT _stdcall ParseGlobalEx(
                        [in] wireHGLOBAL hglobalHTML,
                        [in] unsigned long dwFlags,
                        [in] IMarkupContainer* pContext,
                        [out] IMarkupContainer** ppContainerResult,
                        [in] IMarkupPointer* pPointerStart,
                        [in] IMarkupPointer* pPointerFinish);
        HRESULT _stdcall ValidateElements(
                        [in] IMarkupPointer* pPointerStart,
                        [in] IMarkupPointer* pPointerFinish,
                        [in] IMarkupPointer* pPointerTarget,
                        [in, out] IMarkupPointer* pPointerStatus,
                        [out] IHTMLElement** ppElemFailBottom,
                        [out] IHTMLElement** ppElemFailTop);
        HRESULT _stdcall SaveSegmentsToClipboard(
                        [in] ISegmentList* pSegmentList,
                        [in] unsigned long dwFlags);
    };

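說明在 WIN7 下 IMarkupServices2 的介面識別碼（IID）是 3050F682-98B5-11CF-BB82-00AA00BDCE0B。注意這是 IMarkupServices2 的 IID，而不是 IMarkupServices 本身的；由於 IMarkupServices2 繼承自 IMarkupServices，用它來 QueryInterface 所取得的介面同樣可以當作 IMarkupServices 使用，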

所以在我的檔案裡面添加了如下程式碼:

// Defines the IID_IMarkupServices symbol for builds whose mshtml.h does not
// declare it (the VC6 header, per the accompanying text).
// NOTE(review): the value is the uuid listed for IMarkupServices2
// (3050F682-98B5-11CF-BB82-00AA00BDCE0B) in the type library, which derives
// from IMarkupServices; IMarkupServices presumably has its own IID — confirm
// against a newer mshtml.h.
extern "C" const GUID __declspec(selectany) IID_IMarkupServices =
    {0x3050F682,0x98B5,0x11CF,{0xBB,0x82,0x00,0xAA,0x00,0xBD,0xCE,0x0B}};

再次編譯就OK了;

解析時要注意：必須先把讀入的網頁內容轉換為 wchar_t（寬字元）字串，再交給 ParseString 解析。