libxml2剖析(3)：使用教程

阿新 • • 發佈：2019-02-11

本文整理自官方使用教程http://xmlsoft.org/tutorial/index.html。

示例文件story.xml如下：

<?xml version="1.0"?>
<story>
  <storyinfo>
    <author>John Fleck</author>
    <datewritten>June 2, 2002</datewritten>
    <keyword>example keyword</keyword>
  </storyinfo>
  <body>
    <headline>This is the headline</headline>
    <para>This is the body text.</para>
  </body>
</story>

1、解析xml文件
解析文件時只需要文件名和一個函式呼叫，再加上錯誤處理。下面程式碼查詢keyword節點並列印節點下的文字內容，如下：

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <libxml/xmlmemory.h>
#include <libxml/parser.h>

/*
 * Scan the direct children of cur (the caller passes a <storyinfo> element)
 * and print the text of every <keyword> child to stdout.
 *
 * doc: document that owns cur; handed to xmlNodeListGetString().
 * cur: element whose children are scanned; NULL is tolerated and prints nothing.
 */
void parseStory(xmlDocPtr doc, xmlNodePtr cur){
	xmlNodePtr child;

	if(cur == NULL){
		return;
	}

	for(child = cur->xmlChildrenNode; child != NULL; child = child->next){
		xmlChar *key;

		if(xmlStrcmp(child->name, (const xmlChar *)"keyword") != 0){
			continue;
		}

		/* The text of an element is stored in its child nodes, hence xmlChildrenNode. */
		key = xmlNodeListGetString(doc, child->xmlChildrenNode, 1);

		/* An empty <keyword/> yields NULL, and handing NULL to %s is undefined behavior. */
		printf("keyword: %s\n", key != NULL ? (const char *)key : "");
		if(key != NULL){
			xmlFree(key); /* the returned string is owned by the caller */
		}
	}
}

/*
 * Parse the XML file docname, verify that its root element is <story>, and
 * pass every <storyinfo> child of the root to parseStory().
 *
 * Problems are reported on stderr. The document tree is released on every
 * path, so nothing is handed back to the caller.
 */
static void parseDoc(char *docname){
	xmlDocPtr doc;
	xmlNodePtr root;
	xmlNodePtr child;

	doc = xmlParseFile(docname);
	if(doc == NULL){
		fprintf(stderr, "Document not parsed successfully. \n");
		return;
	}

	root = xmlDocGetRootElement(doc);
	if(root == NULL){
		fprintf(stderr, "empty document\n");
		xmlFreeDoc(doc);
		return;
	}

	if(xmlStrcmp(root->name, (const xmlChar *)"story") != 0){
		/* End the line so that later terminal output does not run into this message. */
		fprintf(stderr, "document of the wrong type, root node != story\n");
		xmlFreeDoc(doc);
		return;
	}

	/* Only the direct children of the root are inspected. */
	for(child = root->xmlChildrenNode; child != NULL; child = child->next){
		if(xmlStrcmp(child->name, (const xmlChar *)"storyinfo") == 0){
			parseStory(doc, child);
		}
	}

	xmlFreeDoc(doc);
}

/*
 * Entry point: expects the path of the XML file as its only argument.
 *
 * Returns EXIT_SUCCESS once parseDoc() has run and EXIT_FAILURE when the
 * argument is missing, following the usual shell convention (0 = success).
 */
int main(int argc, char **argv){
	if(argc <= 1){
		/* argv[0] may be absent when argc is 0, so fall back to a fixed name. */
		fprintf(stderr, "Usage: %s docname\n", argc > 0 ? argv[0] : "program");
		return EXIT_FAILURE;
	}

	parseDoc(argv[1]);
	return EXIT_SUCCESS;
}

    解析XML文件的基本流程如下：
   （1）定義文件指標和節點指標。
   （2）呼叫xmlParseFile()解析文件。如果不成功，註冊一個錯誤並停止。一個常見錯誤是不適當的編碼。XML標準文件除了用預設的UTF-8或UTF-16外，還可顯式指定用其它編碼儲存。如果文件是這樣，libxml2將自動地為你轉換到UTF-8。更多關於XML編碼資訊包含在XML標準中。
   （3）呼叫xmlDocGetRootElement()獲取文件根節點，若無根節點則釋放文件樹並返回。
   （4）確認文件是正確的型別，通過檢查根節點名稱來判斷。
   （5）檢索節點的內容，這需要遍歷文件樹。對每個節點，遍歷其子節點都需要一個迴圈。先用cur = cur->xmlChildrenNode獲取第一個子節點，然後通過cur = cur->next不斷向前遍歷，直到cur==NULL。查詢指定節點時使用xmlStrcmp()函式，如果節點名與你指定的名稱相同，就找到了你要的節點。通常把查詢某個子節點的過程封裝成函式。
   （6）獲取節點中的內容。查詢到指定節點後，呼叫xmlNodeListGetString()獲取節點下的文字。注意在XML中，包含在節點中的文字是這個節點的子節點，因此獲取的是cur->xmlChildrenNode中的字串。xmlNodeListGetString()會為返回的字串分配記憶體，因此記得要用xmlFree()來釋放它。
   （7）呼叫xmlFreeDoc()釋放文件樹指標。
   2、使用XPath查詢資訊

在xml文件中查詢資訊是一項核心工作。Libxml2支援使用XPath表示式來查詢匹配的節點集。簡而言之，XPath之於xml，好比SQL之於關係資料庫。要在一個複雜的xml文件中查詢所需的資訊，XPath簡直是必不可少的工具。下面程式碼查詢所有keyword元素的內容。

#include <libxml/parser.h>
#include <libxml/xpath.h>

/*
 * Parse the XML file named by docname.
 * Returns the parsed document, or NULL after printing a message on stderr
 * when xmlParseFile() fails.
 */
xmlDocPtr getdoc(char *docname){
	xmlDocPtr parsed = xmlParseFile(docname);

	if(parsed != NULL){
		return parsed;
	}

	fprintf(stderr, "Document not parsed successfully. \n");
	return NULL;
}

/*
 * Evaluate the XPath expression xpath against doc.
 *
 * Returns the result object when it holds a non-empty node set; the caller
 * releases it with xmlXPathFreeObject(). Returns NULL, after printing a
 * message on stdout, when the context cannot be created, the evaluation
 * fails, or the node set is empty.
 */
xmlXPathObjectPtr getnodeset(xmlDocPtr doc, xmlChar *xpath){
	xmlXPathObjectPtr found;
	xmlXPathContextPtr ctx = xmlXPathNewContext(doc);

	if(ctx == NULL){
		printf("Error in xmlXPathNewContext\n");
		return NULL;
	}

	/* The context is needed only while the expression is being evaluated. */
	found = xmlXPathEvalExpression(xpath, ctx);
	xmlXPathFreeContext(ctx);

	if(found == NULL){
		printf("Error in xmlXPathEvalExpression\n");
		return NULL;
	}

	if(!xmlXPathNodeSetIsEmpty(found->nodesetval)){
		return found;
	}

	/* Nothing matched: release the empty result rather than returning it. */
	xmlXPathFreeObject(found);
	printf("No result\n");
	return NULL;
}

/*
 * Entry point: prints the text of every <keyword> element found anywhere in
 * the XML file named by the first argument.
 *
 * Returns 0 on success and 1 when the argument is missing or the file
 * cannot be parsed.
 */
int main(int argc, char ** argv){
	/* "//keyword" selects all keyword elements regardless of their position. */
	xmlChar *xpath = (xmlChar*)"//keyword";
	xmlDocPtr doc;
	xmlXPathObjectPtr result;
	int i;

	if(argc <= 1){
		printf("Usage: %s docname\n", argc > 0 ? argv[0] : "program");
		return(1);
	}

	doc = getdoc(argv[1]);
	if(doc == NULL){
		/* getdoc() already reported the failure; do not query a missing document. */
		xmlCleanupParser();
		return(1);
	}

	result = getnodeset(doc, xpath);
	if(result){
		xmlNodeSetPtr nodeset = result->nodesetval;

		for(i = 0; i < nodeset->nodeNr; i++){
			xmlChar *keyword = xmlNodeListGetString(doc, nodeset->nodeTab[i]->xmlChildrenNode, 1);

			/* An empty element yields NULL, which must not be passed to %s. */
			printf("keyword: %s\n", keyword != NULL ? (const char *)keyword : "");
			if(keyword != NULL){
				xmlFree(keyword);
			}
		}
		xmlXPathFreeObject(result);
	}

	xmlFreeDoc(doc);
	xmlCleanupParser(); /* release the library's global state */
	return(0);
}

    可以在story.xml中多插入幾個keyword元素，然後執行一下本程式看看效果。使用XPath查詢資訊的基本流程如下：
   （1）呼叫xmlXPathNewContext()給文件樹建立一個上下文指標。
   （2）呼叫xmlXPathEvalExpression()，傳入XPath表示式和上下文指標，返回一個xmlXPathObjectPtr結果集指標。nodesetval物件包含keyword節點個數(nodeNr)和節點列表(nodeTab)。在使用之前要用xmlXPathNodeSetIsEmpty()檢查nodesetval節點列表是否為空。
   （3）遍歷節點列表nodeTab，用xmlNodeListGetString()獲取每個keyword節點的內容。
   （4）用xmlXPathFreeObject()釋放查詢結果，用xmlFreeDoc()釋放文件樹。
   更多關於Xpath的內容可以參考XPath官方規範http://www.w3.org/TR/xpath/。XPath語法的介紹，可參考w3school上的教程http://www.w3school.com.cn/xpath/index.asp，或者http://w3schools.com/xpath/default.asp。只有掌握XPath，才能掌握使用大型XML檔案獲取資訊的方法，否則每尋找一個節點都要從根節點找起，很耗時耗力。
   3、修改xml文件
   這與上面的過程類似，首先遍歷文件樹，找到要插入（或刪除）的節點處，然後插入（或刪除）相關的內容。下面程式碼在storyinfo節點下插入一個keyword元素。

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <libxml/xmlmemory.h>
#include <libxml/parser.h>

/*
 * Add a <keyword> child element holding the given text under cur.
 * doc is part of the signature but is not used by this function.
 */
void
parseStory(xmlDocPtr doc, xmlNodePtr cur, const xmlChar* keyword) {
	const xmlChar *elementName = (const xmlChar*)"keyword";

	/* NULL means the new element gets no explicit namespace. */
	xmlNewTextChild(cur, NULL, elementName, keyword);
}

/*
 * Parse the XML file docname, check that its root element is <story>, and
 * call parseStory() with the keyword text for every <storyinfo> child of
 * the root.
 *
 * Returns the (possibly modified) document, which the caller releases with
 * xmlFreeDoc(), or NULL after reporting the problem on stderr.
 */
xmlDocPtr
parseDoc(char *docname, char *keyword) {
	xmlDocPtr doc;
	xmlNodePtr root;
	xmlNodePtr child;

	doc = xmlParseFile(docname);
	if (doc == NULL) {
		fprintf(stderr,"Document not parsed successfully. \n");
		return (NULL);
	}

	root = xmlDocGetRootElement(doc);
	if (root == NULL) {
		fprintf(stderr,"empty document\n");
		xmlFreeDoc(doc);
		return (NULL);
	}

	if (xmlStrcmp(root->name, (const xmlChar *) "story") != 0) {
		/* End the line so that later terminal output does not run into this message. */
		fprintf(stderr,"document of the wrong type, root node != story\n");
		xmlFreeDoc(doc);
		return (NULL);
	}

	for (child = root->xmlChildrenNode; child != NULL; child = child->next) {
		if (xmlStrcmp(child->name, (const xmlChar *)"storyinfo") == 0) {
			parseStory(doc, child, (const xmlChar*)keyword);
		}
	}

	return(doc);
}

/*
 * Entry point: argv[1] is the XML file, argv[2] the keyword text to insert.
 * The modified document is written back over the input file.
 *
 * Returns 0 on success and 1 when arguments are missing, the document
 * cannot be processed, or the file cannot be written.
 */
int
main(int argc, char **argv) {
	xmlDocPtr doc;
	int status = 0;

	if (argc <= 2) {
		printf("Usage: %s docname, keyword\n", argc > 0 ? argv[0] : "program");
		return (1);
	}

	doc = parseDoc(argv[1], argv[2]);
	if (doc == NULL) {
		/* parseDoc() has already reported the reason on stderr. */
		return (1);
	}

	/* The last argument (0) disables re-indentation of the output. */
	if (xmlSaveFormatFile(argv[1], doc, 0) == -1) {
		fprintf(stderr, "could not write %s\n", argv[1]);
		status = 1;
	}
	xmlFreeDoc(doc);

	return (status);
}

這裡xmlNewTextChild函式在當前節點指標上新增一個子元素。如果希望元素有名字空間，則可以在這裡加上。新增完後，就要用xmlSaveFormatFile()把修改後的文件寫入到檔案。我們這裡使用原來doc文件指標，因此會覆蓋原來的檔案。第三個引數如果設定為1，則輸出的文件會自動縮排。
若要刪除某個節點，可以使用以下程式碼：

/* Remove the current node from the tree when it is a <keyword> element. */
if(!xmlStrcmp(cur->name, BAD_CAST "keyword")){
	xmlNodePtr tempNode;
	/* Save the successor first: cur cannot be read after it has been freed. */
	tempNode = cur->next;
	xmlUnlinkNode(cur); /* detach the node from the document */
	xmlFreeNode(cur); /* release the detached node */
	cur = tempNode;
	/* NOTE(review): presumably the enclosing loop advances cur itself, so
	   continue skips that step here -- confirm against the surrounding loop. */
	continue;
}

注意libxml2並沒有xmlDelNode或者xmlRemoveNode之類的函式。我們需要將當前節點從文件中斷鏈（unlink），文件就不會再包含這個子節點。這樣做需要使用一個臨時變數來儲存斷鏈節點的後續節點，並記得要手動刪除斷鏈節點的記憶體。
若要給節點新增屬性，可以這樣：

/*
 * Parse the XML file docname, check that its root element is <story>, and
 * add a <reference> child carrying a "uri" attribute set to uri.
 *
 * Returns the modified document, or NULL after reporting the problem on
 * stderr; on failure the document tree has already been freed.
 */
xmlDocPtr
parseDoc(char *docname, char *uri) {
	xmlDocPtr doc;
	xmlNodePtr cur;
	xmlNodePtr newnode;

	doc = xmlParseFile(docname);
	if (doc == NULL ) {
		fprintf(stderr,"Document not parsed successfully. \n");
		return (NULL);
	}

	cur = xmlDocGetRootElement(doc);
	if (cur == NULL) {
		fprintf(stderr,"empty document\n");
		xmlFreeDoc(doc);
		return (NULL);
	}

	if (xmlStrcmp(cur->name, (const xmlChar *) "story") != 0) {
		fprintf(stderr,"document of the wrong type, root node != story\n");
		xmlFreeDoc(doc);
		return (NULL);
	}

	/* libxml2 takes xmlChar strings, so plain literals need an explicit cast. */
	newnode = xmlNewTextChild(cur, NULL, (const xmlChar *) "reference", NULL);
	if (newnode == NULL) {
		fprintf(stderr,"could not create the reference element\n");
		xmlFreeDoc(doc);
		return (NULL);
	}

	/* The attribute pointer itself is not needed; only check that it was created. */
	if (xmlNewProp(newnode, (const xmlChar *) "uri", (const xmlChar *) uri) == NULL) {
		fprintf(stderr,"could not set the uri attribute\n");
		xmlFreeDoc(doc);
		return (NULL);
	}

	return(doc);
}

我們用xmlAttrPtr宣告一個屬性指標。在找到story元素後，用xmlNewTextChild()新建一個reference子元素，用xmlNewProp()給這個子元素新建一個uri屬性。文件修改完後要用xmlSaveFormatFile()寫入到磁碟。
查詢屬性的過程類似。如下：

/*
 * Print the "uri" attribute of every <reference> child of cur to stdout.
 * Children without that attribute are skipped. doc is part of the
 * signature but is not used by this function.
 */
void
getReference(xmlDocPtr doc, xmlNodePtr cur) {
	xmlNodePtr child;

	if (cur == NULL) {
		return;
	}

	for (child = cur->xmlChildrenNode; child != NULL; child = child->next) {
		xmlChar *uri;

		if (xmlStrcmp(child->name, (const xmlChar *)"reference") != 0) {
			continue;
		}

		uri = xmlGetProp(child, (const xmlChar *)"uri");
		if (uri == NULL) {
			/* No such attribute: NULL must not be handed to %s. */
			continue;
		}
		printf("uri: %s\n", (const char *)uri);
		xmlFree(uri); /* xmlGetProp() returns a copy owned by the caller */
	}
}

    關鍵函式為xmlGetProp()，用來獲取節點中的指定屬性。注意如果你使用DTD為屬性宣告一個固定的或預設的值，則該函式也查詢這些值。
   4、建立xml文件
   有了上面的基礎，建立一個xml文件顯得非常簡單，就是一個不斷插入節點的過程。其流程如下：
   （1）用xmlNewDoc函式建立一個文件指標doc；
   （2）用xmlNewNode函式建立一個節點指標root_node；
   （3）用xmlDocSetRootElement將root_node設定為doc的根結點；
   （4）用xmlAddChild()給root_node新增一系列的子節點，並設定子節點的內容和屬性；
   （5）用xmlSaveFile將xml文件存入檔案；
   （6）用xmlFreeDoc函式關閉文件指標，並清除本文件中所有節點動態申請的記憶體。
   下面程式碼建立一個xml文件：

#include <stdio.h>
#include <iostream>
#include <libxml/parser.h>
#include <libxml/tree.h>
using namespace std;

int main(int argc, char* argv[]){
	//定義文件和節點指標
	xmlDocPtr doc=xmlNewDoc(BAD_CAST"1.0");
	xmlNodePtr root_node=xmlNewNode(NULL,BAD_CAST"root");
	//設定根節點
	xmlDocSetRootElement(doc,root_node);
	//在根節點中直接建立節點
	xmlNewTextChild(root_node, NULL, BAD_CAST"newNode1", BAD_CAST"newNode1 content");
	xmlNewTextChild(root_node, NULL, BAD_CAST"newNode2", BAD_CAST"newNode2 content");
	xmlNewTextChild(root_node, NULL, BAD_CAST"newNode3", BAD_CAST"newNode3 content");
	//建立一個節點，設定其內容和屬性，然後加入根結點
	xmlNodePtr node=xmlNewNode(NULL, BAD_CAST"node2");
	xmlNodePtr content=xmlNewText(BAD_CAST"NODE CONTENT");
	xmlAddChild(root_node,node);
	xmlAddChild(node,content);
	xmlNewProp(node,BAD_CAST"attribute",BAD_CAST"yes");
	//建立一個兒子和孫子節點
	node=xmlNewNode(NULL,BAD_CAST"son");
	xmlAddChild(root_node,node);
	xmlNodePtr grandson=xmlNewNode(NULL,BAD_CAST"grandson");
	xmlAddChild(node,grandson);
	xmlAddChild(grandson,xmlNewText(BAD_CAST"This is a grandson node"));
	//儲存xml文件
	int nRel=xmlSaveFile("CreatedXml.xml",doc);
	if(nRel!=-1){
		cout<<"一個xml文件被建立，寫入"<<nRel<<"個位元組"<<endl;
	}
	//釋放文件內節點動態申請的記憶體
	xmlFreeDoc(doc);
	return 1;
}

編譯並執行這個程式，將建立CreatedXml.xml文件，內容如下：

<root>
	<newNode1>newNode1 content</newNode1>
	<newNode2>newNode2 content</newNode2>
	<newNode3>newNode3 content</newNode3>
	<node2 attribute="yes">NODE CONTENT</node2>
	<son>
		<grandson>This is a grandson node</grandson>
	</son>
</root>

    注意，有多種方式可以新增子節點。第一是用xmlNewTextChild直接新增一個文字子節點；第二是先建立新節點，然後用xmlAddChild將新節點加入上層節點。
   5、編碼轉換
   資料編碼相容性問題是很多開發人員都會遇到的一大難題，特別是在使用libxml時。libxml內部使用UTF-8格式儲存和操作資料。你的應用程式資料如果使用其他格式的編碼，例如ISO-8859-1編碼，則在傳給libxml之前必須轉換成UTF-8格式。如果你的應用輸出想用非UTF-8格式的編碼，也需要進行轉換。
   Libxml2本身只支援把UTF-8, UTF-16和ISO-8859-1格式的外部資料轉換成內部使用的UTF-8格式，以及處理完後輸出成這些格式的資料。對其他的字元編碼，需要使用libiconv（當然你也可以使用其他的國際化庫，例如ICU）。當前libiconv支援150多種不同的字元編碼，libiconv的實現儘量保證支援所有我們聽過的編碼格式。在使用libxml之前，一般是通過libiconv把資料先轉換UTF-8格式。在使用libxml處理完之後，再通過libiconv把資料輸出成你要的編碼格式。
   一個常見的錯誤是一份程式碼的不同部分的資料使用不同的編碼格式。例如內部資料使用ISO-8859-1格式的應用程式，聯合使用libxml，而它的內部資料格式為UTF-8。這樣應用程式在執行不同的程式碼段時要不同地對待內部資料，這有可能導致解析資料出現錯誤。
   例子1：使用Libxml內建的編碼處理器
   下面的例子建立一個簡單的文件，新增從命令列得到的資料到文件根元素，並以合適的編碼格式輸出到stdout。對提供的資料我們使用ISO-8859-1編碼，處理過程為從ISO-8859-1到UTF-8，再到ISO-8859-1。命令列上輸入的字串從ISO-8859-1格式轉換成UTF-8格式，以供libxml使用，輸出時又重新轉換成ISO-8859-1格式。

#include <string.h>
#include <libxml/parser.h>

/*
 * Convert the NUL-terminated string in, encoded as encoding, to the UTF-8
 * that libxml2 works with.
 *
 * Returns a NUL-terminated buffer obtained from malloc() (the caller frees
 * it with free()), or NULL after printing a message when the handler is
 * missing, memory runs out, or the conversion fails or stops early.
 */
unsigned char*
convert(unsigned char *in, char *encoding){
	unsigned char *out;
	unsigned char *shrunk;
	int ret,size,out_size,temp;
	xmlCharEncodingHandlerPtr handler;

	if (in == NULL) {
		return (NULL);
	}

	/* Look the handler up first so that its absence is not reported as "no mem". */
	handler = xmlFindCharEncodingHandler(encoding);
	if (!handler || handler->input == NULL) {
		printf("no usable encoding handler for '%s'\n", encoding ? encoding : "");
		return (NULL);
	}

	size = (int)strlen((const char*)in)+1; /* input length including the terminator */
	out_size = size*2-1; /* capacity offered to the handler */
	/* One extra byte keeps room for the terminator without a second allocation. */
	out = (unsigned char*)malloc((size_t)out_size+1);
	if (!out) {
		printf("no mem\n");
		return (NULL);
	}

	temp = size-1;
	ret = handler->input(out, &out_size, in, &temp);
	/* A handler signals failure with a negative value; a non-negative value
	   may be a byte count and is not an error. */
	if (ret < 0 || temp != size-1) {
		if (ret < 0) {
			printf("conversion wasn't successful.\n");
		} else { /* only part of the input was consumed */
			printf("conversion wasn't successful. converted: %i octets.\n",temp);
		}
		free(out);
		return (NULL);
	}

	out[out_size] = 0; /* out_size now holds the number of bytes written */

	/* Shrinking is optional: if realloc fails the original buffer is still valid. */
	shrunk = (unsigned char*)realloc(out, (size_t)out_size+1);
	if (shrunk) {
		out = shrunk;
	}
	return (out);
}

/*
 * Entry point: wraps the ISO-8859-1 text given as argv[1] in a <root>
 * element and prints the document to stdout, encoded as ISO-8859-1.
 *
 * Returns 0 on success and 1 when the argument is missing, the conversion
 * fails, or the document cannot be built or written.
 */
int
main(int argc, char **argv) {
	unsigned char *content, *out;
	xmlDocPtr doc;
	xmlNodePtr rootnode;
	char *encoding = "ISO-8859-1";
	int status = 1;

	if (argc <= 1) {
		printf("Usage: %s content\n", argc > 0 ? argv[0] : "program");
		return(1);
	}

	content = (unsigned char*)argv[1];
	/* libxml2 works on UTF-8, so convert the input first. */
	out = convert(content, encoding);
	if (out == NULL) {
		/* convert() has already printed the reason. */
		return(1);
	}

	doc = xmlNewDoc (BAD_CAST "1.0");
	if (doc != NULL) {
		rootnode = xmlNewDocNode(doc, NULL, (const xmlChar*)"root", out);
		if (rootnode != NULL) {
			xmlDocSetRootElement(doc, rootnode);
			/* "-" selects stdout; the document is written as ISO-8859-1. */
			if (xmlSaveFormatFileEnc("-", doc, encoding, 1) != -1) {
				status = 0;
			}
		}
		xmlFreeDoc(doc);
	}

	free(out); /* convert() allocates its result with malloc() */
	return (status);
}

編譯執行這個程式，假設在命令列上提供的資料"zhou"是ISO-8859-1格式（我的系統中不是），則輸出文件為：

<?xml version="1.0" encoding="ISO-8859-1"?>
<root>zhou</root>

    編碼轉換的基本流程如下：
   （1）用xmlCharEncodingHandlerPtr定義一個編碼處理器指標，用xmlFindCharEncodingHandler()查詢libxml2中指定的編碼處理器。libxml2內建只支援把UTF-8, UTF-16和ISO-8859-1格式的外部資料轉換成內部使用的UTF-8格式。如果要轉換其他格式的資料（如中文編碼），則要使用獨立的libiconv庫給libxml2註冊新編碼處理器。
   （2）呼叫編碼處理器的input()函式，把外部資料轉換成libxml2使用的格式。
   （3）進行xml處理，處理完若要儲存成非UTF-8格式的文件，使用xmlSaveFormatFileEnc()函式。若儲存的編碼格式libxml2不支援，則只能用libiconv把儲存的文件轉換成需要的編碼格式。
   例子2：通過iconv庫給Libxml註冊新的編碼處理器
   下面例子先編寫GBK的編碼處理器gbk_input()和gbk_output()，前者是GBK到UTF-8輸入處理，後者是UTF-8到GBK輸出處理，這兩個處理器都要用到iconv轉換函式。然後呼叫xmlNewCharEncodingHandler()註冊輸入輸出處理器。對輸入輸出資料的編碼轉換由convertToUTF8From()和utf8ConvertTo()來完成，它們都是呼叫xmlFindCharEncodingHandler()查詢已註冊的處理器，然後在處理器上呼叫input()或output()對資料進行編碼轉換。

#include <errno.h>
#include <iconv.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <libxml/encoding.h>
#include <libxml/xmlwriter.h>
#include <libxml/xmlreader.h>

/* 輸入編碼處理器：GBK到UTF-8 */
int gbk_input(unsigned char *out, int *outlen, 
		const unsigned char *in, int *inlen){

	char *outbuf = (char *) out;
	char *inbuf = (char *) in;
	iconv_t iconv_from; /* gbk到utf-8的轉換描述符 */
	size_t len1, len2, rslt;
	/* 注意一般不直接從int*到size_t*的轉換
	   這在32位平臺下是正常的，但到了64平臺下size_t為64位，
	   那(size_t*)inlen將是一個未知的資料 
	*/
	len1 = *inlen;
	len2 = *outlen;
	/* 分配一個從GBK到UTF-8的轉換描述符 */
	iconv_from = iconv_open("utf-8","gbk");
	/* 根據轉換描述符，對資料進行編碼轉換 */
	rslt = iconv(iconv_from, &inbuf, &len1, &outbuf, &len2);
	if(rslt < 0){
		return rslt;
	}
	iconv_close(iconv_from); /* 釋放描述符 */
	*outlen = ((unsigned char *) outbuf - out);
	*inlen = ((unsigned char *) inbuf - in);
	return *outlen;
}

/* 輸出編碼處理器：UTF-8到GBK */
int gbk_output(unsigned char *out, int *outlen, 
				const unsigned char *in, int *inlen){

	char *outbuf = (char *) out;
	char *inbuf = (char *) in;
	iconv_t iconv_to; /* utf-8到gbk的轉換描述符 */
	size_t len1, len2, rslt;
	/* 注意一般不直接從int*到size_t*的轉換
	   這在32位平臺下是正常的，但到了64平臺下size_t為64位，
	   那(size_t*)inlen將是一個未知的資料 
	*/
	len1 = *inlen;
	len2 = *outlen;
	/* 分配一個從UTF-8到GBK的轉換描述符 */
	iconv_to=iconv_open("gbk","utf-8");
	/* 根據轉換描述符，對資料進行編碼轉換 */
	rslt = iconv(iconv_to, &inbuf, &len1, &outbuf, &len2);
	if(rslt < 0){
		return rslt;
	}
	iconv_close(iconv_to); /* 釋放描述符 */
	*outlen = ((unsigned char *) outbuf - out);
	*inlen = ((unsigned char *) inbuf - in);
	return *outlen;
}

/**
 * convertToUTF8From:
 * 把encoding編碼的輸入資料in轉換成utf-8格式返回
 * 出錯則返回NULL
 */
xmlChar *convertToUTF8From(const char *in, const char *encoding){
    xmlChar *out;
    int ret;
    int size;
    int out_size;
    int temp;
    xmlCharEncodingHandlerPtr handler;
    if (in == 0)
		return 0;
	/* 查詢內建的編碼處理器 */
    handler = xmlFindCharEncodingHandler(encoding);
    if (!handler) {
        printf("convertToUTF8From: no encoding handler found for '%s'\n",
               encoding ? encoding : "");
        return 0;
    }
    size = (int)strlen(in) + 1;  /* 輸入資料長度 */
    out_size = size*2 - 1;  /* 輸出資料長度 */
	/* 存放輸出資料 */
    out = (unsigned char *) xmlMalloc((size_t) out_size);
	memset(out, 0, out_size);

    if(out != NULL) {
        temp = size - 1;
		/* 對輸入資料進行編碼轉換，成功後返回0 */
        ret = handler->input(out, &out_size, (const xmlChar *) in, &temp);
        if(ret || temp - size + 1) {  /* 轉換不成功 */
            if(ret){  /* 轉換失敗 */
                printf("convertToUTF8From: conversion wasn't successful.\n");
            }else{  /* 只轉換了一部分資料 */
                printf("convertToUTF8From: conversion wasn't successful. converted: %i octets.\n", temp);
            }
            xmlFree(out); /* 釋放輸出緩衝區 */
            out = 0;
        }else{  /* 轉換成功，在輸出末尾加上null終止符 */
            out = (unsigned char *) xmlRealloc(out, out_size + 1);
            out[out_size] = 0;
        }
    } else {
        printf("convertToUTF8From: no mem\n");
    }
    return out;
}

/**
 * utf8ConvertTo:
 * Convert the NUL-terminated UTF-8 string in to the given encoding.
 *
 * Returns a NUL-terminated buffer allocated with malloc() (release it with
 * free()), or NULL when in is NULL, no usable handler is registered,
 * memory runs out, or the conversion fails or stops early.
 */
char *utf8ConvertTo(xmlChar *in, const char *encoding){
    char *out;
    char *shrunk;
    int ret;
    int size;
    int out_size;
    int temp;
    xmlCharEncodingHandlerPtr handler;

    if (in == NULL)
        return NULL;

    handler = xmlFindCharEncodingHandler(encoding);
    if (!handler) {
        printf("utf8ConvertTo: no encoding handler found for '%s'\n",
               encoding ? encoding : "");
        return NULL;
    }
    if (handler->output == NULL) {
        printf("utf8ConvertTo: handler for '%s' has no output function\n",
               encoding ? encoding : "");
        return NULL;
    }

    size = (int) strlen((char*)in) + 1;  /* input length including the terminator */
    out_size = size * 2 - 1;  /* capacity offered to the handler */
    /* One extra byte keeps room for the terminator; check before any use. */
    out = (char *) malloc((size_t) out_size + 1);
    if (out == NULL) {
        printf("utf8ConvertTo: no mem\n");
        return NULL;
    }

    temp = size - 1;
    ret = handler->output((xmlChar*)out, &out_size, (const xmlChar *) in, &temp);
    /* Handlers such as gbk_output return the number of bytes written on
       success, so only a negative value means failure. */
    if (ret < 0 || temp != size - 1) {
        if (ret < 0) {
            printf("utf8ConvertTo: conversion wasn't successful.\n");
        } else {  /* only part of the input was consumed */
            printf("utf8ConvertTo: conversion wasn't successful. converted: %i octets.\n", temp);
        }
        free(out);
        return NULL;
    }

    out[out_size] = 0;  /* out_size now holds the number of bytes written */

    /* Shrinking is optional: on failure the original buffer stays valid. */
    shrunk = (char *) realloc(out, (size_t) out_size + 1);
    if (shrunk != NULL)
        out = shrunk;
    return out;
}

/*
 * Entry point: registers gbk/gb2312 handlers, converts the GBK text given
 * as argv[1] to UTF-8, wraps it in a <root> element, and prints the
 * document to stdout encoded as gb2312.
 *
 * Returns 0 on success and 1 when the argument is missing or any step
 * (registration, conversion, document creation, output) fails.
 */
int main(int argc, char **argv){
	const char *content;
	xmlChar *out;
	xmlDocPtr doc;
	xmlNodePtr rootnode;
	int status = 1;

	if (argc <= 1) {
		printf("Usage: %s content\n", argc > 0 ? argv[0] : "program");
		return(1);
	}
	content = (const char*)argv[1];

	/* gb2312 reuses the GBK converters. */
	if (xmlNewCharEncodingHandler("gbk", gbk_input, gbk_output) == NULL
			|| xmlNewCharEncodingHandler("gb2312", gbk_input, gbk_output) == NULL) {
		fprintf(stderr, "could not register the gbk/gb2312 encoding handlers\n");
		xmlCleanupCharEncodingHandlers();
		return(1);
	}

	/* libxml2 works on UTF-8, so convert the GBK input first. */
	out = convertToUTF8From(content, "gbk");
	if (out != NULL) {
		doc = xmlNewDoc(BAD_CAST "1.0");
		if (doc != NULL) {
			rootnode = xmlNewDocNode(doc, NULL, (const xmlChar*)"root", out);
			if (rootnode != NULL) {
				xmlDocSetRootElement(doc, rootnode);
				/* "-" selects stdout; the document is written as gb2312. */
				if (xmlSaveFormatFileEnc("-", doc, "gb2312", 1) != -1) {
					status = 0;
				}
			}
			xmlFreeDoc(doc);
		}
		xmlFree(out); /* convertToUTF8From() allocates with xmlMalloc() */
	}

	xmlCleanupCharEncodingHandlers(); /* release the registered handlers */
	return (status);
}

    這個例子在32位與64位Linux平臺下測試通過。iconv庫是Linux預設自帶的元件，因此在Linux中使用libxml非常方便。我們先建立utf-8編碼與gbk編碼的轉換介面，並將介面插入到libxml2庫中，這樣xml庫就支援gb2312和gbk編碼了。當然，這個轉換不會自動完成，我們需要先從libxml庫中查詢特定編碼的介面，libxml支援一些基本的編碼介面，如ISO-8859-1，UTF-16等編碼，但不支援gbk，所以在上述程式碼中，我們定義了gbk_input與gbk_output兩個介面，這兩個介面的原型宣告是libxml庫的標準宣告，即xmlCharEncodingInputFunc和xmlCharEncodingOutputFunc。在使用完libxml庫之後，我們需要釋放libxml庫的轉換資源。
   例子3：直接使用iconv庫進行轉換
   下面例子直接使用iconv函式對輸入輸出進行編碼轉換，而不是通過註冊編碼處理器的方式。

#include <stdio.h>
#include <string.h>
#include <iconv.h>
#include <libxml/parser.h>
#include <libxml/tree.h>

/*
 * Convert inlen bytes at inbuf from from_charset to to_charset with iconv,
 * writing at most outlen bytes to outbuf. outbuf is zero-filled first, so
 * any unused tail stays zero.
 *
 * Returns 0 on success and -1 when an argument is invalid, no converter
 * exists for the charset pair, or the conversion fails.
 */
int encoding_convert(const char *from_charset, const char *to_charset, 
			char *inbuf, int inlen, 
			char* outbuf, int outlen){

	iconv_t cd;
	size_t len1, len2, rslt;

	if(from_charset == NULL || to_charset == NULL
			|| inbuf == NULL || outbuf == NULL
			|| inlen < 0 || outlen < 0)
		return -1;

	/* Copy the lengths instead of casting int* to size_t*: the two types
	   differ in width on 64-bit platforms. */
	len1 = (size_t) inlen;
	len2 = (size_t) outlen;

	cd = iconv_open(to_charset,from_charset);
	if(cd == (iconv_t) -1) /* iconv_open reports failure as -1, not 0 */
		return -1;

	memset(outbuf,0,len2);
	rslt = iconv(cd, &inbuf, &len1, &outbuf, &len2);
	iconv_close(cd); /* released whether or not the conversion worked */

	if(rslt == (size_t) -1)
		return -1;
	return 0;
}

/*
 * Convert a NUL-terminated GB2312 string to UTF-8.
 *
 * Returns a NUL-terminated buffer allocated with xmlMalloc(), which the
 * caller releases with xmlFree(), or NULL when inbuf is NULL, memory runs
 * out, or the conversion fails.
 */
char *gb2312_utf8(char *inbuf){
	size_t inLen;
	int nOutLen;
	char *szOut;

	if(inbuf == NULL){
		return NULL;
	}

	inLen = strlen(inbuf);
	/* Twice the input length is offered to the converter; the extra byte
	   holds the terminator, which also covers empty and 1-byte input. */
	nOutLen = 2 * (int) inLen + 1;
	szOut = (char*)xmlMalloc(nOutLen);
	if(szOut == NULL){
		return NULL;
	}

	/* encoding_convert() zero-fills only the bytes it is given, so the
	   reserved last byte is terminated explicitly. */
	szOut[nOutLen - 1] = '\0';
	if(-1 == encoding_convert("gb2312","utf-8",inbuf,(int) inLen,szOut,nOutLen - 1)){
		xmlFree(szOut);
		szOut=NULL;
	}
	return szOut;
}

/*
 * Convert a NUL-terminated UTF-8 string to GB2312.
 *
 * Returns a NUL-terminated buffer allocated with xmlMalloc(), which the
 * caller releases with xmlFree(), or NULL when inbuf is NULL, memory runs
 * out, or the conversion fails.
 */
char *utf8_gb2312(char *inbuf){
	size_t inLen;
	int nOutLen;
	char *szOut;

	if(inbuf == NULL){
		return NULL;
	}

	inLen = strlen(inbuf);
	/* Twice the input length is offered to the converter; the extra byte
	   holds the terminator, which also covers empty and 1-byte input. */
	nOutLen = 2 * (int) inLen + 1;
	szOut = (char*)xmlMalloc(nOutLen);
	if(szOut == NULL){
		return NULL;
	}

	/* encoding_convert() zero-fills only the bytes it is given, so the
	   reserved last byte is terminated explicitly. */
	szOut[nOutLen - 1] = '\0';
	if(-1 == encoding_convert("utf-8","gb2312",inbuf,(int) inLen,szOut,nOutLen - 1)){
		xmlFree(szOut);
		szOut=NULL;
	}
	return szOut;
}

/*
 * Entry point: builds a document whose node text, attribute value, and one
 * element name come from strings converted with gb2312_utf8(), then saves
 * it to CreatedXml_cn.xml encoded as GB2312.
 *
 * Returns 0 when the file was written and 1 when the document cannot be
 * created, a conversion fails, or the file cannot be saved.
 */
int main(int argc, char **argv){
	int status = 1;
	int nRel;
	char *szOut;
	xmlNodePtr node;
	xmlNodePtr content;
	xmlDocPtr doc = xmlNewDoc(BAD_CAST "1.0");
	xmlNodePtr root_node = xmlNewNode(NULL, BAD_CAST "root");

	if(doc == NULL || root_node == NULL){
		fprintf(stderr, "could not create the document\n");
		if(root_node != NULL){
			xmlFreeNode(root_node);
		}
		if(doc != NULL){
			xmlFreeDoc(doc);
		}
		return 1;
	}
	xmlDocSetRootElement(doc, root_node);

	/* Text must be UTF-8 before it is handed to libxml2. */
	szOut = gb2312_utf8("節點1的內容");
	if(szOut == NULL){
		goto conversion_failed;
	}
	/* Text-only children created directly under the root */
	xmlNewTextChild(root_node, NULL, BAD_CAST "newNode1", BAD_CAST "newNode1 content");
	xmlNewTextChild(root_node, NULL, BAD_CAST "newNode2", BAD_CAST "newNode2 content");
	xmlNewTextChild(root_node, NULL, BAD_CAST "newNode3", BAD_CAST "newNode3 content");
	xmlNewChild(root_node, NULL, BAD_CAST "node1",BAD_CAST szOut);
	xmlFree(szOut);

	/* A node created separately, attached, then given text and an attribute */
	node = xmlNewNode(NULL,BAD_CAST "node2");
	content = xmlNewText(BAD_CAST "NODE CONTENT");
	xmlAddChild(root_node,node);
	xmlAddChild(node,content);
	szOut = gb2312_utf8("屬性值");
	if(szOut == NULL){
		goto conversion_failed;
	}
	xmlNewProp(node,BAD_CAST "attribute",BAD_CAST szOut);
	xmlFree(szOut);

	/* An element whose name is itself converted text; a NULL name must
	   never reach xmlNewChild(). */
	szOut = gb2312_utf8("中文節點");
	if(szOut == NULL){
		goto conversion_failed;
	}
	xmlNewChild(root_node, NULL, BAD_CAST szOut,BAD_CAST "content of chinese node");
	xmlFree(szOut);

	/* Save the document; -1 signals failure */
	nRel = xmlSaveFormatFileEnc("CreatedXml_cn.xml",doc,"GB2312",1);
	if (nRel != -1){
		printf("一個xml文件被建立,寫入%d個位元組", nRel);
		status = 0;
	}else{
		fprintf(stderr, "could not write CreatedXml_cn.xml\n");
	}

	xmlFreeDoc(doc);
	return status;

conversion_failed:
	fprintf(stderr, "character set conversion failed\n");
	xmlFreeDoc(doc);
	return 1;
}

這個例子中，當把中文資料寫入到XML節點時，使用gb2312_utf8()直接轉換成UTF-8格式，這種直接通過iconv轉換的方式更高效。編譯並執行程式，輸出文件如下：

<?xml version="1.0" encoding="GB2312"?>
<root>
	<newNode1>newNode1 content</newNode1>
	<newNode2>newNode2 content</newNode2>
	<newNode3>newNode3 content</newNode3>
	<node1>節點1的內容</node1>
	<node2 attribute="屬性值">NODE CONTENT</node2>
	<中文節點>content of chinese node</中文節點>
</root>

    6、一個真實的例子
   內容整理自http://xmlsoft.org/example.html。
   下面是一個真實的例子。應用程式資料的內容不使用DOM樹，而是使用內部資料結構來儲存。這是一個基於XML儲存結構的資料庫，它儲存了與Gnome相關的任務。如下：

<?xml version="1.0"?>
<gjob:Helping xmlns:gjob="http://www.gnome.org/some-location">
  <gjob:Jobs>

    <gjob:Job>
      <gjob:Project ID="3"/>
      <gjob:Application>GBackup</gjob:Application>
      <gjob:Category>Development</gjob:Category>

      <gjob:Update>
        <gjob:Status>Open</gjob:Status>
        <gjob:Modified>Mon, 07 Jun 1999 20:27:45 -0400 MET DST</gjob:Modified>
        <gjob:Salary>USD 0.00</gjob:Salary>
      </gjob:Update>

      <gjob:Developers>
        <gjob:Developer>
        </gjob:Developer>
      </gjob:Developers>

      <gjob:Contact>
        <gjob:Person>Nathan Clemons</gjob:Person>
        <gjob:Email>nathan@windsofstorm.net</gjob:Email>
        <gjob:Company>
        </gjob:Company>
        <gjob:Organisation>
        </gjob:Organisation>
        <gjob:Webpage>
        </gjob:Webpage>
        <gjob:Snailmail>
        </gjob:Snailmail>
        <gjob:Phone>
        </gjob:Phone>
      </gjob:Contact>

      <gjob:Requirements>
      The program should be released as free software, under the GPL.
      </gjob:Requirements>

      <gjob:Skills>
      </gjob:Skills>

      <gjob:Details>
      A GNOME based system that will allow a superuser to configure 
      compressed and uncompressed files and/or file systems to be backed 
      up with a supported media in the system.  This should be able to 
      perform via find commands generating a list of files that are passed 
      to tar, dd, cpio, cp, gzip, etc., to be directed to the tape machine 
      or via operations performed on the filesystem itself. Email 
      notification and GUI status display very important.
      </gjob:Details>

    </gjob:Job>

  </gjob:Jobs>
</gjob:Helping>

把XML檔案載入到一個內部DOM樹中只是呼叫幾個函式的問題，而遍歷整個樹來收集資料，並生成內部結構則更困難，也更容易出錯。
對輸入結構的定義法則是非常寬鬆的。屬性的順序無關緊要（XML規範清楚地說明了這一點），不要依賴於一個節點的子節點順序通常是一個好的主意，除非這樣做真的使事情變得更困難了。下面是解析person資訊的一段程式碼：

/*
 * One person record.
 *
 * parsePerson() allocates the structure zero-filled and sets name and
 * email from the <Person> and <Email> child elements. The remaining
 * fields are not assigned by the code shown here and stay NULL.
 */
typedef struct person {
    char *name;         /* text of the <Person> element */
    char *email;        /* text of the <Email> element */
    char *company;      /* presumably the <Company> text -- not filled in by the visible code */
    char *organisation; /* presumably the <Organisation> text -- not filled in by the visible code */
    char *smail;        /* presumably the <Snailmail> text -- not filled in by the visible code */
    char *webPage;      /* presumably the <Webpage> text -- not filled in by the visible code */
    char *phone;        /* presumably the <Phone> text -- not filled in by the visible code */
} person, *personPtr;

/*
 * Build a person record from the children of cur.
 *
 * doc: document that owns cur; handed to xmlNodeListGetString().
 * ns:  namespace the child elements must belong to (pointer comparison).
 * cur: element whose children are scanned; its own name is not checked.
 *
 * Returns a newly allocated, zero-filled record with name and email set
 * from the <Person> and <Email> children when present, or NULL when cur is
 * NULL or memory runs out.
 */
personPtr parsePerson(xmlDocPtr doc, xmlNsPtr ns, xmlNodePtr cur) {
    personPtr ret = NULL;
    xmlNodePtr child;

    DEBUG("parsePerson\n");
    if (cur == NULL)
        return(NULL);

    ret = (personPtr) malloc(sizeof(person));
    if (ret == NULL) {
        fprintf(stderr,"out of memory\n");
        return(NULL);
    }
    memset(ret, 0, sizeof(person));

    /* Node names are xmlChar strings, so they are compared with xmlStrcmp()
       rather than strcmp(); the returned text is cast to the char * fields. */
    for (child = cur->xmlChildrenNode; child != NULL; child = child->next) {
        if (child->ns != ns)
            continue;
        if (xmlStrcmp(child->name, (const xmlChar *) "Person") == 0)
            ret->name = (char *) xmlNodeListGetString(doc, child->xmlChildrenNode, 1);
        else if (xmlStrcmp(child->name, (const xmlChar *) "Email") == 0)
            ret->email = (char *) xmlNodeListGetString(doc, child->xmlChildrenNode, 1);
    }

    return(ret);
}

    下面是要注意的一些事項：
   （1）通常一個遞迴的解析風格是更方便的：XML資料天然地遵循重複式地構造，並且是高度結構化的。
   （2）兩個引數是xmlDocPtr和xmlNsPtr型別，即指向XML文件和應用程式保留的名稱空間的指標。文件資訊非常廣泛，為你的應用程式資料集定義一個名稱空間並測試元素和屬性是否屬於這個空間是一個好的程式設計實踐。這只需一個簡單的相等測試（cur->ns == ns）。
   （3）為了查詢文字和屬性值，你可以使用函式xmlNodeListGetString()來獲取所有文字，和由DOM輸出生成的引用節點，並生成一個單一的文字字串。
   下面是解析另外一個結構的程式碼片段：

#include <libxml/tree.h>
/*
 * Description of one job.
 *
 * parseJob() allocates the structure zero-filled and sets projectID,
 * application, category, and contact. nbDevelopers and developers are not
 * assigned by the code shown here.
 */
typedef struct job {
    char *projectID;    /* value of the ID attribute of <Project> */
    char *application;  /* text of the <Application> element */
    char *category;     /* text of the <Category> element */
    personPtr contact;  /* record built by parsePerson() from <Contact> */
    int nbDevelopers;   /* presumably the number of used entries in developers -- not set by the visible code */
    personPtr developers[100]; /* using dynamic alloc is left as an exercise */
} job, *jobPtr;

/*
 * Build a job record from the children of cur.
 *
 * doc: document that owns cur; handed to xmlNodeListGetString().
 * ns:  namespace the child elements must belong to (pointer comparison).
 * cur: element whose children are scanned; its own name is not checked.
 *
 * Returns a newly allocated, zero-filled record with projectID,
 * application, category, and contact set from the matching children, or
 * NULL when cur is NULL or memory runs out. A <Project> without an ID
 * attribute is reported on stderr and leaves projectID NULL.
 */
jobPtr parseJob(xmlDocPtr doc, xmlNsPtr ns, xmlNodePtr cur) {
    jobPtr ret = NULL;
    xmlNodePtr child;

    DEBUG("parseJob\n");
    if (cur == NULL)
        return(NULL);

    ret = (jobPtr) malloc(sizeof(job));
    if (ret == NULL) {
        fprintf(stderr,"out of memory\n");
        return(NULL);
    }
    memset(ret, 0, sizeof(job));

    /* Node and attribute names are xmlChar strings, so they are compared
       with xmlStrcmp() and literals are cast explicitly. */
    for (child = cur->xmlChildrenNode; child != NULL; child = child->next) {
        if (child->ns != ns)
            continue;

        if (xmlStrcmp(child->name, (const xmlChar *) "Project") == 0) {
            ret->projectID = (char *) xmlGetProp(child, (const xmlChar *) "ID");
            if (ret->projectID == NULL) {
                fprintf(stderr, "Project has no ID\n");
            }
        } else if (xmlStrcmp(child->name, (const xmlChar *) "Application") == 0) {
            ret->application = (char *) xmlNodeListGetString(doc, child->xmlChildrenNode, 1);
        } else if (xmlStrcmp(child->name, (const xmlChar *) "Category") == 0) {
            ret->category = (char *) xmlNodeListGetString(doc, child->xmlChildrenNode, 1);
        } else if (xmlStrcmp(child->name, (const xmlChar *) "Contact") == 0) {
            ret->contact = parsePerson(doc, ns, child);
        }
    }

    return(ret);
}

一旦你會使用libxml2，編寫這種型別的程式碼是非常簡單的，也很無趣。最終，你可以寫一個擁有C資料結構和一組XML文件例子或一個XML DTD的樁模組，並生成在C資料和XML儲存之間匯入和匯出資料的程式碼。

    7、詳細程式碼示例
   對Libxml2更詳細的使用介紹，可參考官方的詳細程式碼示例http://xmlsoft.org/examples/index.html。上面提供了Libxml2各個元件怎麼使用的詳細程式碼示例，包括以下部分：
   xmlWriter: 測試xmlWriter的各個API，包括寫入到檔案、寫入到記憶體緩衝區、寫入到新的文件或子樹、字串編碼轉換、對輸出文件進行序列化。
   InputOutput: 演示使用xmlRegisterInputCallbacks來建立一個客戶I/O層，這被用在XInclude方法上下文中，以顯示怎樣構建動態文件。還演示使用xmlDocDumpMemory來輸出文件到字元緩衝區中。
   Parsing: 演示使用xmlReadMemory()讀取XML文件，xmlFreeDoc()釋放文件樹；使用xmlCreatePushParserCtxt()和xmlParseChunk()一塊一塊地讀取XML文件到文件樹中。演示為XML文件建立一個解析上下文，然後解析並驗證這個文件；建立一個文件樹，檢查並驗證結果，最後用xmlFreeDoc()釋放文件樹。演示使用xmlReadFile()讀取XML文件並用xmlFreeDoc()釋放它。
   Tree: 演示怎樣建立文件和節點，並把資料dump到標準輸出或檔案中。演示使用xmlDocGetRootElement()獲取根元素，然後遍歷文件並列印各個元素名。
   XPath: 演示怎樣計算XPath表示式，並在XPath上下文註冊名稱空間，列印結果節點集。演示怎麼載入一個文件、用XPath定位到某個子元素、修改這個元素並儲存結果。這包含了載入/編輯/儲存的一個完整來回。
   xmlReader: 演示使用xmlReaderForFile()解析XML文件，並dump出節點的資訊。演示在用xmlReaderForFile()解析時驗證文件的內容，啟用各種選項，諸如實體替換、DTD屬性不一致等。演示使用xmlTextReaderPreservePattern()提取XML文件中某一部分的子文件。演示重用xmlReader物件來解析多個XML文件。

libxml2剖析(3)：使用教程

libxml2剖析(3)：使用教程

OpenMPI源碼剖析3：

附件3：eclipse memory analyze使用教程

tensorflow基本教程3：variable

易學筆記-RabbitMQ教程3：一個訊息發給所有消費者（路由器型別為：fanout）

Dapper官方教程翻譯3：Dapper方法之Query

#地圖故事#教程3：如何製作分級大小地圖

GreenDao教程(3)：一對一，一對多，多對多

R極簡教程-3：R及RStudio的安裝

Ogre基礎教程3：地形，天空，煙霧

C語言專案教程3：求給定數的百位、十位和個位

Cocos2d-x 3.9教程：9. Cocos2d-x中基於佈局的容器控制元件

Cocos2d-x 3.9教程：10.使用CocosStudio的UI編輯器從UI檔案中載入佈局和控制元件

python3基礎教程專案3：萬能的XML

1：總結並剖析malloc/free和new/delete之間關係和差異。 2：剖析new/delete、new[]/delete[]到底做了些什麼事情。 3：實現NEW_ARRAY/DE

Git詳細使用教程(3)：git add, git commit詳解

在.NET Core中使用MongoDB明細教程(3)：Skip, Sort, Limit, Projections

Raft算法系列教程3：日誌複製

MSP430WARE++的使用3：modbus模塊的調用方法

增強學習Reinforcement Learning經典算法梳理3：TD方法

libxml2剖析(3)：使用教程

相關推薦