C# 的 HtmlAgilityPack 是一款优秀的 HTML 解析库,最重要的是它可以通过 XPath 来定位和解析节点。可惜它是 .NET 程序集,我们用 C++ 的只能通过托管代码(C++/CLI)的方式来调用。
托管代码设置:
按 ALT+F7 打开项目属性页,依次选择:配置属性 → 常规 → 公共语言运行时支持,并将其设置为"公共语言运行时支持(/clr)"。
#include <afxwin.h>
#include <string>
using namespace std;
using namespace System;
#using <System.Xml.dll>
#using "..\Debug\HtmlAgilityPack.dll"
using namespace HtmlAgilityPack;
#include <list>

// Link values collected by ParseLink(); rebuilt on every OnBnClickedOk() run.
list<string> _links;

using namespace System::Runtime::InteropServices; // Marshal

/// Converts a managed System::String to a native std::string (ANSI code page).
///
/// @param s   Managed string to convert; nullptr yields an empty result.
/// @param os  Receives the converted text (previous content is replaced).
void MarshalString(String^ s, string& os)
{
    if (s == nullptr)
    {
        // StringToHGlobalAnsi(nullptr) yields a null pointer, which must not
        // be assigned to a std::string.
        os.clear();
        return;
    }

    IntPtr buffer = Marshal::StringToHGlobalAnsi(s);
    try
    {
        os = static_cast<const char*>(buffer.ToPointer());
    }
    finally
    {
        // Release the unmanaged copy even if the std::string assignment throws.
        Marshal::FreeHGlobal(buffer);
    }
}

/// Appends the value of attribute `name` on `node` to the global `_links`.
///
/// Nodes without the attribute are skipped. For "href", only elements named
/// "link" are recorded; every other element's href is ignored.
/// NOTE(review): this also drops <a href="...">; confirm that is intended.
///
/// @param node  HTML node to inspect.
/// @param name  Attribute name to look up ("background", "href", "src", ...).
inline void ParseLink(HtmlNode^ node, String^ name)
{
    HtmlAttribute^ att = node->Attributes[name];
    if (att == nullptr)
        return;

    if ((name == "href") && (node->Name != "link"))
        return;

    string strValue;
    MarshalString(att->Value, strValue);
    _links.push_back(strValue);
}

/// OK-button handler: downloads a fixed page, saves it to "mshome.htm",
/// collects the background/href/src/lowsrc attribute values via ParseLink()
/// and shows them, space separated, in m_edit.
void CTESTDlg::OnBnClickedOk()
{
    // Start from a clean list so repeated clicks do not accumulate the links
    // gathered by earlier runs.
    _links.clear();

    string url = "http://www.sina.com";
    String^ str2 = gcnew String(url.c_str());

    HtmlWeb^ hw = gcnew HtmlWeb();
    HtmlDocument^ doc = hw->Load(str2);
    doc->Save("mshome.htm");

    HtmlNodeCollection^ atts = doc->DocumentNode->SelectNodes(
        "//*[@background or @lowsrc or @src or @href]");

    // SelectNodes yields nullptr (not an empty collection) when the XPath
    // matches nothing, so guard before touching Count.
    if (atts != nullptr)
    {
        for (int i = 0; i < atts->Count; ++i)
        {
            ParseLink(atts[i], "background");
            ParseLink(atts[i], "href");
            ParseLink(atts[i], "src");
            ParseLink(atts[i], "lowsrc");
        }
    }

    // Join every collected link, each followed by a single space.
    string strurl;
    for (list<string>::const_iterator v = _links.begin(); v != _links.end(); ++v)
    {
        strurl += *v;
        strurl += " ";
    }

    m_edit.SetWindowTextA(strurl.c_str());
}
测试源代码地址