现在的位置: 首页 > 综合 > 正文

解析 HTML

2018年01月08日 ⁄ 综合 ⁄ 共 5403字 ⁄ 字号 评论关闭

xml,json都有大量的库来解析,我们如何解析html呢?

TFHpple是一个小型的封装,可以用来解析html,它是对libxml的封装,语法是xpath。

今天我看到一篇直接用libxml来解析html的文章,参看:http://www.cocoanetics.com/2011/09/taming-html-parsing-with-libxml-1/#comment-3090 文中那张图画得一目了然,很值得收藏。不过这篇文章中的源码不能遍历所有的html节点,我做了一点修改,可以将整个html遍历并打印出来:

// Parses an HTML document with libxml2 and logs every node in document order.
//
// Expected in scope:
//   data     - NSData containing the raw document bytes
//   encoding - the NSStringEncoding of those bytes
//   baseURL  - the document's base URL, i.e. its location

// Translate the NSStringEncoding into the IANA charset name libxml2 expects.
// Either CoreFoundation call can fail; in that case enc stays NULL and
// libxml2 is left to detect the encoding on its own.
CFStringEncoding cfenc = CFStringConvertNSStringEncodingToEncoding(encoding);
CFStringRef cfencstr = CFStringConvertEncodingToIANACharSetName(cfenc);
char encBuffer[64];
const char *enc = NULL;
if (cfencstr && CFStringGetCString(cfencstr, encBuffer, sizeof(encBuffer), kCFStringEncodingASCII))
{
    enc = encBuffer;
}

// NSData is not NUL-terminated, so pass an explicit length instead of
// relying on htmlReadDoc() finding a terminator.
// HTML_PARSE_NOERROR / HTML_PARSE_NOWARNING are the HTML-parser spellings of
// the XML_PARSE_* flags and carry the same bit values.
htmlDocPtr _htmlDocument = htmlReadMemory((const char *)[data bytes],
      (int)[data length],
      [[baseURL absoluteString] UTF8String],
      enc,
      HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);

// The document node is the root of the traversal; it is NULL when parsing failed.
xmlNodePtr rootNode = (xmlNodePtr)_htmlDocument;
xmlNodePtr currentNode = rootNode;

while (currentNode)
{
    if (currentNode->type == XML_ELEMENT_NODE)
    {
        // opening tag with all attributes
        NSMutableArray *attrArray = [NSMutableArray array];

        for (xmlAttrPtr attrNode = currentNode->properties; attrNode; attrNode = attrNode->next)
        {
            // valueless attributes (e.g. <input disabled>) have no child node
            xmlNodePtr contents = attrNode->children;
            const xmlChar *rawValue = contents ? contents->content : NULL;

            // convert explicitly as UTF-8; %s in an NSString format would not
            NSString *attrName = [NSString stringWithUTF8String:(const char *)attrNode->name];
            NSString *attrValue = rawValue ? [NSString stringWithUTF8String:(const char *)rawValue] : @"";

            [attrArray addObject:[NSString stringWithFormat:@"%@='%@'", attrName, attrValue]];
        }

        NSString *attrString = [attrArray componentsJoinedByString:@" "];

        if ([attrString length])
        {
            attrString = [@" " stringByAppendingString:attrString];
        }

        NSLog(@"<%s%@>", currentNode->name, attrString);
    }
    else if (currentNode->type == XML_TEXT_NODE)
    {
        if (currentNode->content)
        {
            NSLog(@"%@", [NSString stringWithCString:(const char *)currentNode->content encoding:NSUTF8StringEncoding]);
        }
    }
    else if (currentNode->type == XML_COMMENT_NODE)
    {
        // the comment text lives in ->content; ->name is always "comment"
        NSString *commentText = currentNode->content
            ? [NSString stringWithUTF8String:(const char *)currentNode->content]
            : @"";
        NSLog(@"/* %@ */", commentText);
    }

    // depth first: descend into children before visiting siblings
    if (currentNode->children)
    {
        currentNode = currentNode->children;
        continue;
    }

    // No children: climb until a node with a following sibling (or the root)
    // is reached, logging a closing tag for every element that is left.
    // Elements without children therefore only log their opening tag.
    while (currentNode != rootNode && currentNode->next == NULL)
    {
        currentNode = currentNode->parent;

        if (currentNode == NULL)
        {
            break;
        }

        if (currentNode->type == XML_ELEMENT_NODE)
        {
            NSLog(@"</%s>", currentNode->name);
        }
    }

    // back at the root (or detached from the tree): traversal is complete
    if (currentNode == NULL || currentNode == rootNode)
    {
        break;
    }

    currentNode = currentNode->next;
}

// Release the document only after the traversal; every node visited above
// belongs to it.
if (_htmlDocument)
{
    xmlFreeDoc(_htmlDocument);
}

不过我还是喜欢用TFHpple,因为它很简单,也好用,但是它的功能不是很完善。比如,不能获取children node,我就写了两个方法,一个是获取children node,一个是获取所有的contents。另外,node属性的content所用的key与node自身content的key一样,都是@"nodeContent";正确情况下,属性的content应该使用@"attributeContent"作为key。

所以我写了这个方法,同时修改node属性的content key.

    /// Recursively converts a libxml2 node into a dictionary.
    ///
    /// Keys written to the returned dictionary:
    ///   @"nodeName"           - the node's name
    ///   @"nodeAttributeArray" - array of attribute dictionaries, each holding
    ///                           @"attributeName" and, when present, @"attributeContent"
    ///   @"nodeChildArray"     - array of dictionaries for the child nodes
    ///
    /// A text node whose parent is an element or an attribute does not get a
    /// dictionary of its own: its text is written into `parentResult` under
    /// @"nodeContent" (element parent) or, whitespace-trimmed, under
    /// @"attributeContent" (attribute parent), and nil is returned.
    ///
    /// @param currentNode  The node to convert.
    /// @param parentResult The dictionary being built for the parent of `currentNode`.
    /// @return The dictionary for `currentNode`, or nil when the node was folded
    ///         into `parentResult` (or `currentNode` is NULL).
    NSDictionary *DictionaryForNode2(xmlNodePtr currentNode, NSMutableDictionary *parentResult)
    {
        if (!currentNode)
        {
            return nil;
        }

        NSMutableDictionary *resultForNode = [NSMutableDictionary dictionary];

        if (currentNode->name)
        {
            NSString *nodeName =
            [NSString stringWithCString:(const char *)currentNode->name encoding:NSUTF8StringEncoding];
            // the conversion yields nil for bytes that are not valid UTF-8;
            // inserting nil into a dictionary would raise
            if (nodeName)
            {
                [resultForNode setObject:nodeName forKey:@"nodeName"];
            }
        }

        if (currentNode->content && currentNode->type == XML_TEXT_NODE && currentNode->parent)
        {
            NSString *currentNodeContent = [NSString stringWithCString:(const char *)currentNode->content encoding:NSUTF8StringEncoding];

            if (currentNode->parent->type == XML_ELEMENT_NODE)
            {
                if (currentNodeContent)
                {
                    [parentResult setObject:currentNodeContent forKey:@"nodeContent"];
                }
                return nil;
            }

            if (currentNode->parent->type == XML_ATTRIBUTE_NODE)
            {
                if (currentNodeContent)
                {
                    [parentResult
                     setObject:
                     [currentNodeContent
                      stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]]
                     forKey:@"attributeContent"];
                }
                return nil;
            }
        }

        xmlAttr *attribute = currentNode->properties;
        if (attribute)
        {
            NSMutableArray *attributeArray = [NSMutableArray array];
            while (attribute)
            {
                NSMutableDictionary *attributeDictionary = [NSMutableDictionary dictionary];
                NSString *attributeName = attribute->name
                    ? [NSString stringWithCString:(const char *)attribute->name encoding:NSUTF8StringEncoding]
                    : nil;
                if (attributeName)
                {
                    [attributeDictionary setObject:attributeName forKey:@"attributeName"];
                }

                if (attribute->children)
                {
                    // a text child stores itself into attributeDictionary and returns nil;
                    // any other kind of child comes back as a dictionary
                    NSDictionary *childDictionary = DictionaryForNode2(attribute->children, attributeDictionary);
                    if (childDictionary)
                    {
                        [attributeDictionary setObject:childDictionary forKey:@"attributeContent"];
                    }
                }

                if ([attributeDictionary count] > 0)
                {
                    [attributeArray addObject:attributeDictionary];
                }
                attribute = attribute->next;
            }

            if ([attributeArray count] > 0)
            {
                [resultForNode setObject:attributeArray forKey:@"nodeAttributeArray"];
            }
        }

        xmlNodePtr childNode = currentNode->children;
        if (childNode)
        {
            NSMutableArray *childContentArray = [NSMutableArray array];
            while (childNode)
            {
                NSDictionary *childDictionary = DictionaryForNode2(childNode, resultForNode);
                if (childDictionary)
                {
                    [childContentArray addObject:childDictionary];
                }
                childNode = childNode->next;
            }
            if ([childContentArray count] > 0)
            {
                [resultForNode setObject:childContentArray forKey:@"nodeChildArray"];
            }
        }

        return resultForNode;
    }

TFHppleElement.m里加了两个key 常量

// Key under which an attribute dictionary stores the attribute's value;
// matches the @"attributeContent" key written by DictionaryForNode2.
NSString * const TFHppleNodeAttributeContentKey  = @"attributeContent";  
// Key under which a node dictionary stores the array of its child-node
// dictionaries; matches the @"nodeChildArray" key written by DictionaryForNode2.
NSString * const TFHppleNodeChildArrayKey        = @"nodeChildArray";  

并修改获取属性方法为:

    /// Returns the element's attributes as a name -> value dictionary.
    ///
    /// Each entry of the node's attribute array contributes one pair. An
    /// attribute that carries no value (its dictionary has no content entry)
    /// is reported with an empty string so that it is still present in the
    /// result; an entry without a name is skipped.
    - (NSDictionary *) attributes
    {
      NSMutableDictionary * translatedAttributes = [NSMutableDictionary dictionary];
      for (NSDictionary * attributeDict in [node objectForKey:TFHppleNodeAttributeArrayKey]) {
        id attributeName = [attributeDict objectForKey:TFHppleNodeAttributeNameKey];
        if (!attributeName) {
          // a nil key would make setObject:forKey: raise
          continue;
        }
        id attributeValue = [attributeDict objectForKey:TFHppleNodeAttributeContentKey];
        // a nil object would make setObject:forKey: raise as well
        [translatedAttributes setObject:(attributeValue ? attributeValue : @"")
                                 forKey:attributeName];
      }
      return translatedAttributes;
    }

并添加获取children node 方法:

    /// YES when the node dictionary holds an entry for TFHppleNodeChildArrayKey,
    /// NO otherwise.
    - (BOOL) hasChildren
    {
        return [node objectForKey: TFHppleNodeChildArrayKey] != nil;
    }
      
    /// The object stored under TFHppleNodeChildArrayKey, or nil when
    /// -hasChildren reports NO.
    - (NSArray *) children
    {
        if (![self hasChildren])
        {
            return nil;
        }

        NSArray *childNodes = [node objectForKey: TFHppleNodeChildArrayKey];
        return childNodes;
    }

最后我还加了一个获取所有content的方法:

/// Described by the accompanying text as a method for fetching all content.
/// NOTE(review): the implementation is not shown; presumably the argument is
/// an XPath or CSS query and the matched content is returned as a single
/// string - confirm against the implementation.
- (NSString *)contentsAt:(NSString *)xPathOrCss;  

抱歉!评论已关闭.