现在的位置: 首页 > 综合 > 正文

iOS解析HTML

2013年09月14日 ⁄ 综合 ⁄ 共 5641字 ⁄ 字号 评论关闭

xml,json都有大量的库来解析,我们如何解析html呢?



TFHpple是一个小型的封装,可以用来解析html,它是对libxml的封装,语法是xpath。

今天我看到一个直接用libxml来解析html,参看:http://www.cocoanetics.com/2011/09/taming-html-parsing-with-libxml-1/#comment-3090 那张图画得一目了然,很值得收藏。这个文章中的源码不能遍历所有的html,我做了一点修改可以将html遍历打印出来


// Inputs expected to be in scope:
//   data     - NSData holding the raw HTML document bytes
//   encoding - the NSStringEncoding of `data`
//   baseURL  - the document's base URL, i.e. its location
//
// Parses the HTML with libxml2, logs every element (with attributes), text
// node and comment in document order, logs a closing tag for every element,
// and finally frees the parsed document.

// Translate the Cocoa encoding into the IANA charset name libxml2 expects.
CFStringEncoding cfenc = CFStringConvertNSStringEncodingToEncoding(encoding);
CFStringRef cfencstr = CFStringConvertEncodingToIANACharSetName(cfenc);

// CFStringGetCStringPtr() is allowed to return NULL even for valid strings,
// so copy the name into a local buffer instead. A NULL encoding makes libxml2
// fall back to its own detection.
char encBuffer[64];
const char *enc = NULL;
if (cfencstr && CFStringGetCString(cfencstr, encBuffer, sizeof(encBuffer), kCFStringEncodingASCII))
{
    enc = encBuffer;
}

// NSData is not NUL-terminated, so use the length-aware reader rather than
// htmlReadDoc(), which scans for a terminating zero byte.
htmlDocPtr _htmlDocument = htmlReadMemory((const char *)[data bytes],
                                          (int)[data length],
                                          [[baseURL absoluteString] UTF8String],
                                          enc,
                                          XML_PARSE_NOERROR | XML_PARSE_NOWARNING);

if (_htmlDocument)
{
    xmlNodePtr rootNode = (xmlNodePtr)_htmlDocument;
    xmlNodePtr currentNode = rootNode;

    while (currentNode)
    {
        if (currentNode->type == XML_ELEMENT_NODE)
        {
            // Opening tag with its attributes.
            NSMutableArray *attrArray = [NSMutableArray array];

            for (xmlAttrPtr attrNode = currentNode->properties; attrNode; attrNode = attrNode->next)
            {
                // Valueless attributes (e.g. <input disabled>) have no child node.
                xmlNodePtr contents = attrNode->children;
                const char *value = (contents && contents->content) ? (const char *)contents->content : "";

                [attrArray addObject:[NSString stringWithFormat:@"%s='%s'", attrNode->name, value]];
            }

            NSString *attrString = [attrArray componentsJoinedByString:@" "];

            if ([attrString length])
            {
                attrString = [@" " stringByAppendingString:attrString];
            }

            NSLog(@"<%s%@>", currentNode->name, attrString);
        }
        else if (currentNode->type == XML_TEXT_NODE)
        {
            if (currentNode->content)
            {
                NSLog(@"%@", [NSString stringWithCString:(const char *)currentNode->content encoding:NSUTF8StringEncoding]);
            }
        }
        else if (currentNode->type == XML_COMMENT_NODE)
        {
            // The comment text lives in `content`; `name` is just the node-kind label.
            NSLog(@"/* %s */", currentNode->content ? (const char *)currentNode->content : "");
        }

        // Depth first: descend into children when there are any.
        // NOTE(review): entity-reference nodes are skipped because their `children`
        // point at the entity declaration rather than owned child nodes - confirm
        // against the libxml2 tree documentation.
        if (currentNode->children && currentNode->type != XML_ENTITY_REF_NODE)
        {
            currentNode = currentNode->children;
            continue;
        }

        // No children: close this node, then move to the next sibling, climbing
        // towards the root (closing each finished ancestor) until one is found.
        while (currentNode)
        {
            if (currentNode->type == XML_ELEMENT_NODE)
            {
                NSLog(@"</%s>", currentNode->name);
            }

            if (currentNode == rootNode)
            {
                // Back at the starting node: the whole tree has been visited.
                currentNode = NULL;
                break;
            }

            if (currentNode->next)
            {
                currentNode = currentNode->next;
                break;
            }

            currentNode = currentNode->parent;
        }
    }

    // Free the document only after the traversal has finished with it.
    xmlFreeDoc(_htmlDocument);
}



不过我还是喜欢用TFHpple,因为它很简单,也好用,但是它的功能不是很完善。比如,它不能获取子节点(children node),我就写了两个方法:一个用来获取子节点,一个用来获取所有的内容(contents)。另外,节点属性内容的key与节点自身内容的key相同,都是@"nodeContent";正确情况下,属性内容的key应该是@"attributeContent",

所以我写了这个方法,同时修改node属性的content key.

/**
 * Recursively converts a libxml2 node into a dictionary.
 *
 * Keys written into the returned dictionary:
 *   @"nodeName"           - the node's name
 *   @"nodeAttributeArray" - array of attribute dictionaries, each holding
 *                           @"attributeName" and (when the attribute has a
 *                           value) @"attributeContent"
 *   @"nodeChildArray"     - array of dictionaries for the child nodes
 *
 * Text nodes are folded into their parent instead of being returned: the text
 * of an element's text child is stored in parentResult under @"nodeContent",
 * and the whitespace-trimmed text of an attribute's text child is stored in
 * parentResult under @"attributeContent". In both cases nil is returned.
 *
 * @param currentNode  The node to convert. Must not be NULL.
 * @param parentResult The dictionary of the enclosing node or attribute; may be nil.
 * @return The dictionary for currentNode, or nil when the node was folded into
 *         parentResult.
 */
NSDictionary *DictionaryForNode2(xmlNodePtr currentNode, NSMutableDictionary *parentResult)
{
    NSMutableDictionary *resultForNode = [NSMutableDictionary dictionary];

    if (currentNode->name)
    {
        // stringWithCString:encoding: returns nil for bytes that are not valid
        // UTF-8, and inserting nil into a dictionary throws.
        NSString *nodeName =
            [NSString stringWithCString:(const char *)currentNode->name encoding:NSUTF8StringEncoding];
        if (nodeName)
        {
            [resultForNode setObject:nodeName forKey:@"nodeName"];
        }
    }

    // `content` is only consumed for text nodes, so it is only read for them.
    if (currentNode->type == XML_TEXT_NODE && currentNode->content && currentNode->parent)
    {
        xmlElementType parentType = currentNode->parent->type;

        if (parentType == XML_ELEMENT_NODE || parentType == XML_ATTRIBUTE_NODE)
        {
            NSString *currentNodeContent =
                [NSString stringWithCString:(const char *)currentNode->content encoding:NSUTF8StringEncoding];

            if (currentNodeContent)
            {
                if (parentType == XML_ELEMENT_NODE)
                {
                    [parentResult setObject:currentNodeContent forKey:@"nodeContent"];
                }
                else
                {
                    NSString *trimmedContent =
                        [currentNodeContent stringByTrimmingCharactersInSet:
                            [NSCharacterSet whitespaceAndNewlineCharacterSet]];
                    [parentResult setObject:trimmedContent forKey:@"attributeContent"];
                }
            }

            // The text now belongs to the parent; it is not a child entry of its own.
            return nil;
        }
    }

    xmlAttr *attribute = currentNode->properties;

    if (attribute)
    {
        NSMutableArray *attributeArray = [NSMutableArray array];

        while (attribute)
        {
            NSMutableDictionary *attributeDictionary = [NSMutableDictionary dictionary];

            NSString *attributeName = nil;
            if (attribute->name)
            {
                attributeName =
                    [NSString stringWithCString:(const char *)attribute->name encoding:NSUTF8StringEncoding];
            }

            if (attributeName)
            {
                [attributeDictionary setObject:attributeName forKey:@"attributeName"];
            }

            if (attribute->children)
            {
                // A text child stores itself under @"attributeContent" in
                // attributeDictionary and yields nil; anything else comes back
                // as a dictionary and is stored here.
                NSDictionary *childDictionary = DictionaryForNode2(attribute->children, attributeDictionary);

                if (childDictionary)
                {
                    [attributeDictionary setObject:childDictionary forKey:@"attributeContent"];
                }
            }

            if ([attributeDictionary count] > 0)
            {
                [attributeArray addObject:attributeDictionary];
            }

            attribute = attribute->next;
        }

        if ([attributeArray count] > 0)
        {
            [resultForNode setObject:attributeArray forKey:@"nodeAttributeArray"];
        }
    }

    xmlNodePtr childNode = currentNode->children;

    if (childNode)
    {
        NSMutableArray *childContentArray = [NSMutableArray array];

        while (childNode)
        {
            NSDictionary *childDictionary = DictionaryForNode2(childNode, resultForNode);

            if (childDictionary)
            {
                [childContentArray addObject:childDictionary];
            }

            childNode = childNode->next;
        }

        // The key is only present when at least one child dictionary exists.
        if ([childContentArray count] > 0)
        {
            [resultForNode setObject:childContentArray forKey:@"nodeChildArray"];
        }
    }

    return resultForNode;
}

TFHppleElement.m里加了两个key 常量

// Key under which an attribute dictionary stores the attribute's value;
// matches the @"attributeContent" key written by DictionaryForNode2.
NSString * const TFHppleNodeAttributeContentKey = @"attributeContent"; 

// Key under which a node dictionary stores the array of its child node
// dictionaries; matches the @"nodeChildArray" key written by DictionaryForNode2.
NSString * const TFHppleNodeChildArrayKey = @"nodeChildArray";

并修改获取属性方法为:

/**
 * Returns the node's attributes as a name -> value dictionary.
 *
 * Each entry of the node's attribute array contributes one pair, read from
 * TFHppleNodeAttributeNameKey and TFHppleNodeAttributeContentKey. Entries
 * without a name are skipped, and an attribute without a stored value maps to
 * the empty string, because inserting a nil key or value would throw.
 */
- (NSDictionary *) attributes
{
    NSMutableDictionary * translatedAttributes = [NSMutableDictionary dictionary];

    for (NSDictionary * attributeDict in [node objectForKey:TFHppleNodeAttributeArrayKey]) {
        id attributeName = [attributeDict objectForKey:TFHppleNodeAttributeNameKey];
        if (!attributeName) {
            continue;
        }

        id attributeContent = [attributeDict objectForKey:TFHppleNodeAttributeContentKey];
        if (!attributeContent) {
            // Valueless attribute: keep it visible to callers with an empty value.
            attributeContent = @"";
        }

        [translatedAttributes setObject:attributeContent forKey:attributeName];
    }

    return translatedAttributes;
}

并添加获取children node 方法:

/**
 * Reports whether this element has child nodes.
 *
 * @return YES when the node dictionary holds a value under
 *         TFHppleNodeChildArrayKey, NO otherwise.
 */
- (BOOL) hasChildren
{
    return [node objectForKey: TFHppleNodeChildArrayKey] != nil;
}




/**
 * Returns the child node dictionaries of this element.
 *
 * @return The array stored under TFHppleNodeChildArrayKey, or nil when the
 *         node dictionary has no such entry.
 */
- (NSArray *) children
{
    // objectForKey: already yields nil when the key is absent, so no separate
    // existence check is needed.
    return [node objectForKey: TFHppleNodeChildArrayKey];
}

 

参看:http://giles-wang.blogspot.com/2011/08/iphoneansi.html

原文:http://blog.csdn.net/favormm/article/details/6794487

抱歉!评论已关闭.