xml,json都有大量的库来解析,我们如何解析html呢?
TFHpple是一个小型的封装,可以用来解析html,它是对libxml的封装,语法是xpath。
今天我看到一篇直接用libxml来解析html的文章,参看:http://www.cocoanetics.com/2011/09/taming-html-parsing-with-libxml-1/#comment-3090 。文中那张图画得一目了然,很值得收藏。不过这篇文章中的源码不能遍历整个html文档,我做了一点修改,可以把html的所有节点遍历并打印出来:
// Parses an HTML document with libxml2 and logs every node (open tags with
// their attributes, text, comments and close tags) in document order.
//
// Expected to be in scope:
//   data     - NSData containing the raw document bytes
//   encoding - the NSStringEncoding of those bytes
//   baseURL  - the document's base URL, i.e. its location

// Translate the NSStringEncoding into the IANA charset name libxml2 expects.
CFStringEncoding cfenc = CFStringConvertNSStringEncodingToEncoding(encoding);
CFStringRef cfencstr = CFStringConvertEncodingToIANACharSetName(cfenc);

// Copy the name into a local buffer: CFStringGetCStringPtr() is allowed to
// return NULL even for a valid string. enc stays NULL when no IANA name is
// available, and NULL is an accepted value for the parser's encoding argument.
char encBuffer[64];
const char *enc = NULL;
if (cfencstr && CFStringGetCString(cfencstr, encBuffer, sizeof(encBuffer), kCFStringEncodingASCII))
{
    enc = encBuffer;
}

// NSData is not NUL-terminated, so hand libxml2 an explicit length instead of
// treating the bytes as a C string.
htmlDocPtr _htmlDocument = htmlReadMemory((const char *)[data bytes],
                                          (int)[data length],
                                          [[baseURL absoluteString] UTF8String],
                                          enc,
                                          HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);

// The walk never leaves the subtree below rootNode. Point rootNode at any
// other node (for example an XPath result) to dump only that subtree.
xmlNodePtr rootNode = (xmlNodePtr)_htmlDocument;
xmlNodePtr currentNode = rootNode;

while (currentNode)
{
    if (currentNode->type == XML_ELEMENT_NODE)
    {
        // Open tag, followed by name='value' pairs for each attribute.
        NSMutableArray *attrArray = [NSMutableArray array];
        for (xmlAttrPtr attrNode = currentNode->properties; attrNode; attrNode = attrNode->next)
        {
            NSString *attrName = [NSString stringWithUTF8String:(const char *)attrNode->name];
            xmlNodePtr contents = attrNode->children;
            if (contents && contents->content)
            {
                NSString *attrValue = [NSString stringWithUTF8String:(const char *)contents->content];
                [attrArray addObject:[NSString stringWithFormat:@"%@='%@'", attrName, attrValue]];
            }
            else
            {
                // Attribute without a value, e.g. <input disabled>.
                [attrArray addObject:[NSString stringWithFormat:@"%@", attrName]];
            }
        }
        NSString *attrString = [attrArray componentsJoinedByString:@" "];
        if ([attrString length])
        {
            attrString = [@" " stringByAppendingString:attrString];
        }
        NSLog(@"<%s%@>", currentNode->name, attrString);
    }
    else if (currentNode->type == XML_TEXT_NODE)
    {
        if (currentNode->content)
        {
            NSLog(@"%@", [NSString stringWithCString:(const char *)currentNode->content
                                            encoding:NSUTF8StringEncoding]);
        }
    }
    else if (currentNode->type == XML_COMMENT_NODE)
    {
        // The comment text lives in content; name is only the node-kind label.
        if (currentNode->content)
        {
            NSLog(@"/* %s */", currentNode->content);
        }
    }

    // Descend first. Entity references are skipped because their children
    // pointer refers to the entity declaration, which is outside this subtree.
    if (currentNode->children && currentNode->type != XML_ENTITY_REF_NODE)
    {
        currentNode = currentNode->children;
        continue;
    }

    // No children left to visit: close this node, then climb towards the root,
    // closing each ancestor, until one of them has a next sibling to continue with.
    while (currentNode)
    {
        if (currentNode->type == XML_ELEMENT_NODE)
        {
            NSLog(@"</%s>", currentNode->name);
        }
        if (currentNode == rootNode)
        {
            // Back at the starting node: the traversal is complete.
            currentNode = NULL;
            break;
        }
        if (currentNode->next)
        {
            currentNode = currentNode->next;
            break;
        }
        currentNode = currentNode->parent;
    }
}

// Release the document only after the traversal has finished using its nodes.
if (_htmlDocument)
{
    xmlFreeDoc(_htmlDocument);
}
不过我还是喜欢用TFHpple,因为它很简单,也好用,但是它的功能不是很完善。比如,它不能获取children node,所以我写了两个方法,一个获取children node,一个获取所有的contents。另外,node属性内容的key与node自身内容的key一样,都是@"nodeContent",而正确情况下属性内容的key应该是@"attributeContent"。
所以我写了下面这个方法,同时修改了node属性的content key。
/**
 * Recursively converts a libxml2 node into a dictionary.
 *
 * Keys written into the returned dictionary:
 *   @"nodeName"           - the node's name
 *   @"nodeContent"        - text content (written by a text child, see below)
 *   @"nodeAttributeArray" - array of attribute dictionaries, each holding
 *                           @"attributeName" and, when the attribute has a
 *                           value, @"attributeContent"
 *   @"nodeChildArray"     - array of dictionaries for the non-text children
 *
 * A text node whose parent is an element or an attribute is not returned as a
 * dictionary. Its content is stored into parentResult instead (under
 * @"nodeContent" for an element parent, or whitespace-trimmed under
 * @"attributeContent" for an attribute parent) and nil is returned. If a
 * parent has several text children, each one overwrites the previous value.
 *
 * @param currentNode  The node to convert; nil is returned for NULL.
 * @param parentResult The dictionary of the enclosing node or attribute, which
 *                     receives the content of text children. May be nil.
 * @return The dictionary for currentNode, or nil when the node was folded into
 *         parentResult.
 */
NSDictionary *DictionaryForNode2(xmlNodePtr currentNode, NSMutableDictionary *parentResult)
{
    if (currentNode == NULL)
    {
        return nil;
    }

    NSMutableDictionary *resultForNode = [NSMutableDictionary dictionary];

    if (currentNode->name)
    {
        NSString *nodeName = [NSString stringWithCString:(const char *)currentNode->name
                                                encoding:NSUTF8StringEncoding];
        // The conversion yields nil for bytes that are not valid UTF-8, and
        // inserting nil into a dictionary raises.
        if (nodeName)
        {
            [resultForNode setObject:nodeName forKey:@"nodeName"];
        }
    }

    if (currentNode->content && currentNode->type == XML_TEXT_NODE && currentNode->parent)
    {
        NSString *text = [NSString stringWithCString:(const char *)currentNode->content
                                            encoding:NSUTF8StringEncoding];
        xmlElementType parentType = currentNode->parent->type;

        if (parentType == XML_ELEMENT_NODE)
        {
            if (text)
            {
                [parentResult setObject:text forKey:@"nodeContent"];
            }
            return nil;
        }

        if (parentType == XML_ATTRIBUTE_NODE)
        {
            if (text)
            {
                NSCharacterSet *blanks = [NSCharacterSet whitespaceAndNewlineCharacterSet];
                [parentResult setObject:[text stringByTrimmingCharactersInSet:blanks]
                                 forKey:@"attributeContent"];
            }
            return nil;
        }
    }

    // Attributes: one dictionary per attribute, collected into an array.
    NSMutableArray *attributeArray = [NSMutableArray array];
    for (xmlAttr *attribute = currentNode->properties; attribute; attribute = attribute->next)
    {
        NSMutableDictionary *attributeDictionary = [NSMutableDictionary dictionary];

        NSString *attributeName = nil;
        if (attribute->name)
        {
            attributeName = [NSString stringWithCString:(const char *)attribute->name
                                               encoding:NSUTF8StringEncoding];
        }
        if (attributeName)
        {
            [attributeDictionary setObject:attributeName forKey:@"attributeName"];
        }

        if (attribute->children)
        {
            // A text child stores its value into attributeDictionary itself and
            // returns nil; any other kind of child comes back as a dictionary.
            NSDictionary *childDictionary = DictionaryForNode2(attribute->children, attributeDictionary);
            if (childDictionary)
            {
                [attributeDictionary setObject:childDictionary forKey:@"attributeContent"];
            }
        }

        if ([attributeDictionary count] > 0)
        {
            [attributeArray addObject:attributeDictionary];
        }
    }
    if ([attributeArray count] > 0)
    {
        [resultForNode setObject:attributeArray forKey:@"nodeAttributeArray"];
    }

    // Children: text children fold into resultForNode, the rest are collected.
    NSMutableArray *childContentArray = [NSMutableArray array];
    for (xmlNodePtr childNode = currentNode->children; childNode; childNode = childNode->next)
    {
        NSDictionary *childDictionary = DictionaryForNode2(childNode, resultForNode);
        if (childDictionary)
        {
            [childContentArray addObject:childDictionary];
        }
    }
    if ([childContentArray count] > 0)
    {
        [resultForNode setObject:childContentArray forKey:@"nodeChildArray"];
    }

    return resultForNode;
}
TFHppleElement.m里加了两个key 常量
// Key for an attribute's value inside an attribute dictionary.
NSString *const TFHppleNodeAttributeContentKey = @"attributeContent";

// Key for the array of child node dictionaries inside a node dictionary.
NSString *const TFHppleNodeChildArrayKey = @"nodeChildArray";
并修改获取属性方法为:
/**
 * Flattens the node's attribute array into a name -> content dictionary.
 *
 * Entries that lack a name or a content value (for example an attribute that
 * was written without a value) are skipped, because inserting nil into a
 * dictionary raises NSInvalidArgumentException.
 *
 * @return A dictionary mapping attribute names to their content; empty when
 *         the node has no usable attributes.
 */
- (NSDictionary *)attributes
{
    NSMutableDictionary *translatedAttributes = [NSMutableDictionary dictionary];
    for (NSDictionary *attributeDict in [node objectForKey:TFHppleNodeAttributeArrayKey])
    {
        id attributeName = [attributeDict objectForKey:TFHppleNodeAttributeNameKey];
        id attributeContent = [attributeDict objectForKey:TFHppleNodeAttributeContentKey];
        if (attributeName && attributeContent)
        {
            [translatedAttributes setObject:attributeContent forKey:attributeName];
        }
    }
    return translatedAttributes;
}
并添加获取children node 方法:
/// Returns YES when the node dictionary has an entry for TFHppleNodeChildArrayKey.
- (BOOL)hasChildren
{
    return [node objectForKey:TFHppleNodeChildArrayKey] != nil;
}

/// Returns the object stored under TFHppleNodeChildArrayKey, or nil when
/// -hasChildren reports NO.
- (NSArray *)children
{
    return [self hasChildren] ? [node objectForKey:TFHppleNodeChildArrayKey] : nil;
}
最后我还加了一个获取所有content的方法:
/// Returns a string for the given XPath or CSS query.
/// NOTE(review): only the declaration is visible here; the accompanying text
/// describes this as returning all contents found at the query, so confirm the
/// exact behaviour against the implementation.
- (NSString *)contentsAt:(NSString *)xPathOrCss;
请看源码。
参看:http://giles-wang.blogspot.com/2011/08/iphoneansi.html
http://blog.csdn.net/favormm/article/details/6794487
http://blog.csdn.net/sirchenhua/article/details/7291517
http://www.cocoachina.com/newbie/basic/2011/1020/3398.html
http://www.cocoanetics.com/2011/09/taming-html-parsing-with-libxml-1/#comment-3090