项目地址:https://github.com/tenderlove/libxml2 参考代码文件:https://github.com/GNOME/libxml2/blob/master/HTMLparser.c
安装库:apt install libxml2-dev -y(编译时需要加上头文件与链接参数,例如:g++ main.cpp $(pkg-config --cflags --libs libxml-2.0))
1. 解析 html 内容
1.1 从文件解析
#include <string>

#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
#include <libxml/xpathInternals.h>

// Parse an HTML file from disk into a libxml2 document tree.
std::string filepath = "abc.html";
// Arguments: file path, encoding (NULL lets libxml2 detect it), parser options.
// HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING suppress the parser's error and
// warning reports for malformed real-world HTML.
xmlDocPtr doc = htmlReadFile(filepath.c_str(), NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
if (doc != NULL) {
    // ... use the document here ...

    // The caller owns the returned document and must release it.
    xmlFreeDoc(doc);
}
// doc == NULL means the file could not be opened or parsed.
或者从已打开的文件描述符(例如 socket)中读取:
htmlDocPtr htmlReadFd(int fd, const char *url, const char *encoding, int options)
1.2 从内存中解析
#include <string>

#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
#include <libxml/xpathInternals.h>

// Parse an HTML document held in an in-memory buffer.
std::string content = "<html></html>";
// Arguments: buffer, buffer size in bytes (the API takes an int), base URL
// (NULL: none), encoding (NULL lets libxml2 detect it), parser options.
htmlDocPtr doc = htmlReadMemory(content.data(), static_cast<int>(content.size()), NULL, NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
if (doc != NULL) {
    // ... use the document here ...

    // The caller owns the returned document and must release it.
    xmlFreeDoc(doc);
}
// doc == NULL means the buffer could not be parsed.
1.3 使用 xpath 进行标签检索
#include <string>

#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
#include <libxml/xpathInternals.h>

// Parse an HTML file, select nodes with an XPath expression, and read one
// attribute plus the text content of every matched node.
std::string filepath = "abc.html";
// XPath expression to evaluate; e.g. "//div[@class='ok' and @data='a']"
// selects div elements that carry the given attribute values.
std::string xname = "//a";
// Name of the attribute to read from each matched node.
std::string name = "href";

xmlDocPtr doc = htmlReadFile(filepath.c_str(), NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
if (doc != NULL) {
    xmlXPathContextPtr xpathctx = xmlXPathNewContext(doc);
    if (xpathctx != NULL) {
        xmlXPathObjectPtr xpathobj = xmlXPathEvalExpression(BAD_CAST xname.c_str(), xpathctx);
        if (xpathobj != NULL) {
            // nodesetval is NULL when the result is not a node set or nothing matched.
            xmlNodeSetPtr nodes = xpathobj->nodesetval;
            const int count = (nodes != NULL) ? nodes->nodeNr : 0;
            for (int i = 0; i < count; i++) {
                xmlNodePtr node = nodes->nodeTab[i];

                // xmlGetProp returns a newly allocated string, or NULL when the
                // attribute is absent; never build a std::string from NULL.
                std::string attr_value;
                xmlChar* prop = xmlGetProp(node, BAD_CAST name.c_str());
                if (prop != NULL) {
                    attr_value = reinterpret_cast<const char*>(prop);
                    xmlFree(prop);
                }

                // xmlNodeGetContent returns the text of the node and its
                // descendants as a newly allocated string (NULL if there is none).
                std::string text_value;
                xmlChar* text = xmlNodeGetContent(node);
                if (text != NULL) {
                    text_value = reinterpret_cast<const char*>(text);
                    xmlFree(text);
                }

                // ... use attr_value and text_value here ...
            }
            xmlXPathFreeObject(xpathobj);
        }
        xmlXPathFreeContext(xpathctx);
    }
    xmlFreeDoc(doc);
}
// Releases the library's global state; call it only once, at program exit,
// after all other libxml2 usage has finished.
xmlCleanupParser();