现在的位置: 首页 > 综合 > 正文

iOS解析HTML

2013年02月23日 ⁄ 综合 ⁄ 共 13684字 ⁄ 字号 评论关闭

xml,json都有大量的库来解析,我们如何解析html呢?

TFHpple是一个小型的封装,可以用来解析html,它是对libxml的封装,语法是xpath。

今天我看到一个直接用libxml来解析html的例子,参看:http://www.cocoanetics.com/2011/09/taming-html-parsing-with-libxml-1/#comment-3090 那张图画得一目了然,很值得收藏。这篇文章中的源码不能遍历所有的html,我做了一点修改,可以将html遍历打印出来。

  // Parses an HTML document with libxml2 and logs every element (with its
  // attributes), text node and comment in document order.
  //
  // Inputs expected in the enclosing scope:
  //   data     - NSData containing the raw document bytes
  //   encoding - the NSStringEncoding of the data
  //   baseURL  - the document's base URL, i.e. its location (may be nil)

  // Translate the Cocoa encoding into the IANA charset name libxml2 expects.
  // The name is copied into a local buffer because CFStringGetCStringPtr() is
  // allowed to return NULL; a NULL `enc` makes libxml2 detect the encoding itself.
  CFStringEncoding cfenc = CFStringConvertNSStringEncodingToEncoding(encoding);
  CFStringRef cfencstr = CFStringConvertEncodingToIANACharSetName(cfenc);
  char encBuffer[64];
  const char *enc = NULL;
  if (cfencstr && CFStringGetCString(cfencstr, encBuffer, sizeof(encBuffer), kCFStringEncodingASCII))
  {
      enc = encBuffer;
  }

  // NSData is not NUL-terminated, so the buffer is parsed by explicit length
  // (htmlReadMemory) rather than as a C string (htmlReadDoc).
  htmlDocPtr _htmlDocument = NULL;
  if ([data length] > 0 && [data length] <= INT_MAX)
  {
      _htmlDocument = htmlReadMemory((const char *)[data bytes],
                                     (int)[data length],
                                     [[baseURL absoluteString] UTF8String],
                                     enc,
                                     HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
  }

  // libxml2 strings are UTF-8; convert them explicitly instead of relying on
  // "%s", and map NULL / undecodable strings to the empty string.
  NSString *(^stringFromXML)(const xmlChar *) = ^NSString *(const xmlChar *xmlString) {
      if (xmlString == NULL)
      {
          return @"";
      }
      NSString *converted = [NSString stringWithUTF8String:(const char *)xmlString];
      return converted ? converted : @"";
  };

  // The document itself is the root of the walk; xmlDoc and xmlNode share the
  // leading type/children/parent/next fields that the traversal relies on.
  xmlNodePtr rootNode = (xmlNodePtr)_htmlDocument;
  xmlNodePtr currentNode = rootNode;

  while (currentNode)
  {
      if (currentNode->type == XML_ELEMENT_NODE)
      {
          // opening tag with all attributes
          NSMutableArray *attrArray = [NSMutableArray array];

          for (xmlAttrPtr attrNode = currentNode->properties; attrNode; attrNode = attrNode->next)
          {
              // attributes without a value (e.g. <input disabled>) have no child node
              xmlNodePtr contents = attrNode->children;
              NSString *attrValue = stringFromXML(contents ? contents->content : NULL);

              [attrArray addObject:[NSString stringWithFormat:@"%@='%@'", stringFromXML(attrNode->name), attrValue]];
          }

          NSString *attrString = [attrArray componentsJoinedByString:@" "];

          if ([attrString length])
          {
              attrString = [@" " stringByAppendingString:attrString];
          }

          NSLog(@"<%@%@>", stringFromXML(currentNode->name), attrString);
      }
      else if (currentNode->type == XML_TEXT_NODE)
      {
          NSLog(@"%@", stringFromXML(currentNode->content));
      }
      else if (currentNode->type == XML_COMMENT_NODE)
      {
          // log the comment text; the node name of a comment is always "comment"
          NSLog(@"/* %@ */", stringFromXML(currentNode->content));
      }

      // depth first: descend into the children before visiting siblings
      if (currentNode->children)
      {
          currentNode = currentNode->children;
          continue;
      }

      // No children: climb until a node with a next sibling is found, logging
      // the closing tag of every element that is left on the way up.
      while (currentNode && currentNode != rootNode && !currentNode->next)
      {
          currentNode = currentNode->parent;

          if (currentNode && currentNode->type == XML_ELEMENT_NODE)
          {
              NSLog(@"</%@>", stringFromXML(currentNode->name));
          }
      }

      // back at the root (or detached from the tree): the walk is complete
      if (currentNode == NULL || currentNode == rootNode)
      {
          break;
      }

      currentNode = currentNode->next;
  }

  // Release the document only after the traversal no longer touches its nodes.
  if (_htmlDocument)
  {
      xmlFreeDoc(_htmlDocument);
      _htmlDocument = NULL;
  }

不过我还是喜欢用TFHpple,因为它很简单,也好用,但是它的功能不是很完善。比如,不能获取children node,我就写了两个方法,一个是获取children node,一个是获取所有的contents。还有,node属性的content所用的key与node自身content的key一样,都是@"nodeContent",正确情况下属性的key应该是@"attributeContent"。

所以我写了这个方法,同时修改node属性的content key.

  /**
   * Recursively converts a libxml2 node into a dictionary.
   *
   * Keys written into the returned dictionary:
   *   @"nodeName"           - the node's name
   *   @"nodeAttributeArray" - array of dictionaries holding @"attributeName"
   *                           and, when the attribute has a value,
   *                           @"attributeContent"
   *   @"nodeChildArray"     - array of dictionaries for the child nodes
   *
   * A text node is not returned as a dictionary of its own. Its text is
   * stored in parentResult instead: under @"nodeContent" when the parent is an
   * element, or (whitespace-trimmed) under @"attributeContent" when the parent
   * is an attribute. In both cases the function returns nil.
   *
   * @param currentNode  The node to convert; NULL yields nil.
   * @param parentResult The dictionary of the enclosing node or attribute,
   *                     which receives the text of text nodes.
   * @return The dictionary for the node, or nil if the node was folded into
   *         parentResult (or was NULL).
   */
  NSDictionary *DictionaryForNode2(xmlNodePtr currentNode, NSMutableDictionary *parentResult)
  {
      if (currentNode == NULL)
      {
          return nil;
      }

      NSMutableDictionary *resultForNode = [NSMutableDictionary dictionary];

      if (currentNode->name)
      {
          NSString *nodeName =
          [NSString stringWithCString:(const char *)currentNode->name encoding:NSUTF8StringEncoding];
          // the conversion yields nil for bytes that are not valid UTF-8, and
          // a nil object must never be stored in a dictionary
          if (nodeName)
          {
              [resultForNode setObject:nodeName forKey:@"nodeName"];
          }
      }

      // Only text nodes contribute their content, so nothing is converted for
      // other node types.
      if (currentNode->type == XML_TEXT_NODE && currentNode->content && currentNode->parent)
      {
          NSString *textContent =
          [NSString stringWithCString:(const char *)currentNode->content encoding:NSUTF8StringEncoding];
          xmlElementType parentType = currentNode->parent->type;

          if (parentType == XML_ELEMENT_NODE)
          {
              if (textContent)
              {
                  [parentResult setObject:textContent forKey:@"nodeContent"];
              }
              return nil;
          }

          if (parentType == XML_ATTRIBUTE_NODE)
          {
              if (textContent)
              {
                  [parentResult
                   setObject:
                   [textContent
                    stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]]
                   forKey:@"attributeContent"];
              }
              return nil;
          }
      }

      // Only element nodes carry attributes.
      xmlAttr *attribute = (currentNode->type == XML_ELEMENT_NODE) ? currentNode->properties : NULL;
      if (attribute)
      {
          NSMutableArray *attributeArray = [NSMutableArray array];
          for (; attribute; attribute = attribute->next)
          {
              NSMutableDictionary *attributeDictionary = [NSMutableDictionary dictionary];

              if (attribute->name)
              {
                  NSString *attributeName =
                  [NSString stringWithCString:(const char *)attribute->name encoding:NSUTF8StringEncoding];
                  if (attributeName)
                  {
                      [attributeDictionary setObject:attributeName forKey:@"attributeName"];
                  }
              }

              // The value of an attribute is its first child. A text child
              // stores itself into attributeDictionary and returns nil; any
              // other kind of child is stored as a nested dictionary.
              if (attribute->children)
              {
                  NSDictionary *childDictionary = DictionaryForNode2(attribute->children, attributeDictionary);
                  if (childDictionary)
                  {
                      [attributeDictionary setObject:childDictionary forKey:@"attributeContent"];
                  }
              }

              if ([attributeDictionary count] > 0)
              {
                  [attributeArray addObject:attributeDictionary];
              }
          }

          if ([attributeArray count] > 0)
          {
              [resultForNode setObject:attributeArray forKey:@"nodeAttributeArray"];
          }
      }

      // Convert the children; text children fold themselves into resultForNode
      // and therefore do not appear in the child array.
      if (currentNode->children)
      {
          NSMutableArray *childContentArray = [NSMutableArray array];
          for (xmlNodePtr childNode = currentNode->children; childNode; childNode = childNode->next)
          {
              NSDictionary *childDictionary = DictionaryForNode2(childNode, resultForNode);
              if (childDictionary)
              {
                  [childContentArray addObject:childDictionary];
              }
          }

          if ([childContentArray count] > 0)
          {
              [resultForNode setObject:childContentArray forKey:@"nodeChildArray"];
          }
      }

      return resultForNode;
  }

TFHppleElement.m里加了两个key 常量

  // Key under which the value of an attribute is stored in an attribute dictionary.
  NSString *const TFHppleNodeAttributeContentKey = @"attributeContent";
  // Key under which the array of child-node dictionaries is stored in a node dictionary.
  NSString *const TFHppleNodeChildArrayKey = @"nodeChildArray";

并修改获取属性方法为:

  /**
   * Returns the attributes of this element as a dictionary that maps each
   * attribute name to its value.
   *
   * An attribute that has no stored value is mapped to the empty string, and
   * an entry without a name is skipped: passing nil to -setObject:forKey:
   * raises NSInvalidArgumentException.
   *
   * @return A dictionary of attribute values keyed by attribute name; empty
   *         when the node has no attributes.
   */
  - (NSDictionary *) attributes
  {
    NSMutableDictionary * translatedAttributes = [NSMutableDictionary dictionary];
    for (NSDictionary * attributeDict in [node objectForKey:TFHppleNodeAttributeArrayKey]) {
      id attributeName = [attributeDict objectForKey:TFHppleNodeAttributeNameKey];
      if (!attributeName) {
        continue;
      }
      id attributeValue = [attributeDict objectForKey:TFHppleNodeAttributeContentKey];
      [translatedAttributes setObject:(attributeValue ? attributeValue : @"")
                               forKey:attributeName];
    }
    return translatedAttributes;
  }

并添加获取children node 方法:

  /**
   * Tells whether a child array is stored for this node.
   *
   * @return YES if the node dictionary has an entry for
   *         TFHppleNodeChildArrayKey, NO otherwise.
   */
  - (BOOL) hasChildren
  {
      return ([node objectForKey: TFHppleNodeChildArrayKey] != nil) ? YES : NO;
  }

  /**
   * Returns the child array stored for this node.
   *
   * @return The object stored under TFHppleNodeChildArrayKey, or nil when
   *         -hasChildren reports NO.
   */
  - (NSArray *) children
  {
      return [self hasChildren] ? [node objectForKey: TFHppleNodeChildArrayKey] : nil;
  }

最后我还加了一个获取所有content的方法:

  1. - (NSString *)contentsAt:(NSString *)xPathOrCss;  

请看源码

参看:http://giles-wang.blogspot.com/2011/08/iphoneansi.html

http://blog.csdn.net/favormm/article/details/6794487

http://blog.csdn.net/sirchenhua/article/details/7291517

http://www.cocoachina.com/newbie/basic/2011/1020/3398.html

http://www.cocoanetics.com/2011/09/taming-html-parsing-with-libxml-1/#comment-3090

抱歉!评论已关闭.