现在的位置: 首页 > 综合 > 正文

用 DOM 实现文章采集 —— 通过类似 jQuery 选择器语法的方式采集指定节点的文本。

2014年02月17日 ⁄ 综合 ⁄ 共 5924字 ⁄ 字号 评论关闭
    /// <summary>
    /// Minimal DOM query helper for HtmlAgilityPack documents that accepts a small,
    /// jQuery-like selector syntax.
    /// </summary>
    /// <remarks>
    /// A selector is a space-separated list of segments applied from left to right. Every
    /// segment searches the descendants of the nodes matched by the previous segment.
    /// Supported segment forms:
    /// <list type="bullet">
    /// <item><description>"#id" - element lookup by id (first segment only).</description></item>
    /// <item><description>"tag" - case-insensitive match on the element name.</description></item>
    /// <item><description>".name" - case-insensitive match on one of the element's class names.</description></item>
    /// <item><description>"tag:fn(arg)" or ".name:fn(arg)" - the match is post-processed by one of the
    /// functions eq, lt, gt, first, last, even, odd, next, contains, empty or header.</description></item>
    /// </list>
    /// </remarks>
    public class DomQuery
    {
        // Tokenizer for a single selector segment: token 0 is the tag/class expression,
        // token 1 the function name and token 2 the function argument.
        private static readonly Regex TokenRegex = new Regex(@"([.|\-|\w]+)", RegexOptions.Singleline);

        // Separator between selector segments.
        private static readonly char[] SegmentSeparators = new char[] { ' ' };

        // The class attribute is a whitespace-separated list, not only a space-separated one.
        private static readonly char[] ClassSeparators = new char[] { ' ', '\t', '\r', '\n', '\f' };

        // Element names selected by the "header" function.
        private static readonly string[] HeaderNames = new string[] { "h1", "h2", "h3", "h4", "h5", "h6" };

        /// <summary>
        /// Returns every node of the document that matches the selector.
        /// </summary>
        /// <param name="_HtmlDocument">Document to search.</param>
        /// <param name="selector">Space-separated selector, see the remarks on <see cref="DomQuery"/>.</param>
        /// <returns>The matched nodes in document order; an empty list when nothing matches
        /// or the selector contains no segment.</returns>
        /// <exception cref="ArgumentNullException">The document or the selector is null.</exception>
        /// <exception cref="NotSupportedException">A segment uses an unknown function.</exception>
        /// <remarks>
        /// When the first segment is not an id, the search starts from the element children of the
        /// document node and only inspects their descendants, so those top-level elements themselves
        /// are never returned for the first segment.
        /// </remarks>
        public IList<HtmlNode> Get(HtmlDocument _HtmlDocument, string selector)
        {
            if (_HtmlDocument == null)
            {
                throw new ArgumentNullException("_HtmlDocument");
            }
            if (selector == null)
            {
                throw new ArgumentNullException("selector");
            }

            List<HtmlNode> hnList = new List<HtmlNode>();

            string[] segments = selector.Split(SegmentSeparators, StringSplitOptions.RemoveEmptyEntries);
            if (segments.Length == 0)
            {
                // Empty or whitespace-only selector: nothing to match.
                return hnList;
            }

            int nextSegment;
            if (segments[0].StartsWith("#", StringComparison.Ordinal))
            {
                HtmlNode byId = _HtmlDocument.GetElementbyId(segments[0].TrimStart('#'));
                if (byId != null)
                {
                    hnList.Add(byId);
                }
                nextSegment = 1;
            }
            else
            {
                hnList.AddRange(_HtmlDocument.DocumentNode.ChildNodes.Where(x => x.NodeType == HtmlNodeType.Element));
                nextSegment = 0;
            }

            for (int i = nextSegment; i < segments.Length; i++)
            {
                hnList = Get(hnList, segments[i]);
            }

            return hnList;
        }

        /// <summary>
        /// Finds the first node matching the selector and returns its InnerHtml.
        /// </summary>
        /// <param name="_HtmlDocument">Document to search.</param>
        /// <param name="selector">Selector, see <see cref="Get(HtmlDocument, string)"/>.</param>
        /// <returns>The InnerHtml of the first match, or null when nothing matches.</returns>
        public string SingleGetInnerHtml(HtmlDocument _HtmlDocument, string selector)
        {
            HtmlNode hn = SingleGet(_HtmlDocument, selector);
            return hn == null ? null : hn.InnerHtml;
        }

        /// <summary>
        /// Finds the first node matching the selector and returns its trimmed InnerText.
        /// </summary>
        /// <param name="_HtmlDocument">Document to search.</param>
        /// <param name="selector">Selector, see <see cref="Get(HtmlDocument, string)"/>.</param>
        /// <returns>The trimmed InnerText of the first match, or null when nothing matches.</returns>
        public string SingleGetInnerText(HtmlDocument _HtmlDocument, string selector)
        {
            HtmlNode hn = SingleGet(_HtmlDocument, selector);
            return hn == null ? null : hn.InnerText.Trim();
        }

        /// <summary>
        /// Finds the first node matching the selector.
        /// </summary>
        /// <param name="_HtmlDocument">Document to search.</param>
        /// <param name="selector">Selector, see <see cref="Get(HtmlDocument, string)"/>.</param>
        /// <returns>The first match in document order, or null when nothing matches.</returns>
        public HtmlNode SingleGet(HtmlDocument _HtmlDocument, string selector)
        {
            IList<HtmlNode> hnList = Get(_HtmlDocument, selector);
            return hnList.Count == 0 ? null : hnList[0];
        }

        #region Attributes
        /// <summary>
        /// Collects the value of one attribute from every node that carries it.
        /// </summary>
        /// <param name="_HtmlNodes">Nodes to inspect; may be null or empty.</param>
        /// <param name="attr">Attribute name.</param>
        /// <returns>The attribute values in node order; nodes without the attribute are skipped.
        /// An empty array is returned for a null or empty node list.</returns>
        public string[] Attr(IList<HtmlNode> _HtmlNodes, string attr)
        {
            if (_HtmlNodes == null || _HtmlNodes.Count == 0)
            {
                return new string[0];
            }

            return _HtmlNodes
                .Where(node => node != null)
                .Select(node => node.Attributes[attr])
                .Where(attribute => attribute != null)
                .Select(attribute => attribute.Value)
                .ToArray();
        }
        #endregion

        #region Selector evaluation
        /// <summary>
        /// Applies one selector segment to the descendants of the given nodes.
        /// </summary>
        /// <param name="_HtmlNodes">Nodes matched by the previous segment.</param>
        /// <param name="Expression">A single selector segment (no spaces).</param>
        /// <returns>The nodes matched by this segment.</returns>
        /// <remarks>
        /// Without a function the whole segment is matched against all descendants at once.
        /// With a function the segment is evaluated separately for every input node, so index
        /// based functions (eq, lt, gt, first, last, even, odd) count per input node.
        /// </remarks>
        private List<HtmlNode> Get(List<HtmlNode> _HtmlNodes, string Expression)
        {
            string _expre = null;
            string fun = null;
            int index = -1;
            string keyword = null;

            MatchCollection mc = TokenRegex.Matches(Expression);
            if (mc.Count > 0)
            {
                _expre = mc[0].Value;
            }
            if (mc.Count > 1)
            {
                fun = mc[1].Value;
            }
            if (mc.Count > 2)
            {
                // Keep the raw argument even when it is numeric, otherwise "contains(123)"
                // would be evaluated with a null keyword.
                keyword = mc[2].Value;

                // A non-numeric argument leaves index at 0 (TryParse's failure value).
                int.TryParse(keyword, out index);
            }

            if (string.IsNullOrEmpty(fun))
            {
                if (Expression.StartsWith(".", StringComparison.Ordinal))
                {
                    return Class(_HtmlNodes, Expression).ToList();
                }
                return NodeType(_HtmlNodes, Expression).ToList();
            }

            bool byClass = _expre.StartsWith(".", StringComparison.Ordinal);
            List<HtmlNode> list = new List<HtmlNode>();
            foreach (HtmlNode n in _HtmlNodes)
            {
                // Materialize once so the function below never re-runs the parallel query.
                List<HtmlNode> candidates = byClass
                    ? Class(n, _expre).ToList()
                    : NodeType(n, _expre).ToList();

                list.AddRange(FunAction(candidates, fun, index, keyword));
            }
            return list;
        }

        /// <summary>
        /// Post-processes a set of matched nodes with a selector function.
        /// </summary>
        /// <param name="v">Nodes matched by the tag/class part of the segment.</param>
        /// <param name="fun">Function name (case-insensitive).</param>
        /// <param name="index">Numeric argument used by eq, lt and gt.</param>
        /// <param name="keyword">Raw argument used by contains.</param>
        /// <returns>The filtered or transformed nodes.</returns>
        /// <exception cref="ArgumentNullException">"contains" is used without an argument.</exception>
        /// <exception cref="NotSupportedException">The function name is unknown.</exception>
        private IEnumerable<HtmlNode> FunAction(IEnumerable<HtmlNode> v, string fun, int index, string keyword)
        {
            // Work on a materialized list so every branch enumerates the source exactly once.
            IList<HtmlNode> items = v as IList<HtmlNode> ?? v.ToList();

            // Invariant lower-casing: culture-sensitive casing breaks names such as "FIRST" under tr-TR.
            switch (fun.ToLowerInvariant())
            {
                case "eq":
                    return items.Where((node, position) => position == index);
                case "lt":
                    return items.Where((node, position) => position < index);
                case "gt":
                    return items.Where((node, position) => position > index);
                case "first":
                    return items.Take(1);
                case "last":
                    if (items.Count == 0)
                    {
                        return items;
                    }
                    return new HtmlNode[] { items[items.Count - 1] };
                case "even":
                    return items.Where((node, position) => position % 2 == 0);
                case "odd":
                    return items.Where((node, position) => position % 2 == 1);
                case "next":
                    // The last sibling has no NextSibling; drop the resulting nulls so that
                    // later segments never dereference a null node.
                    return items.Select(node => node.NextSibling).Where(sibling => sibling != null);
                case "contains":
                    if (keyword == null)
                    {
                        throw new ArgumentNullException("keyword", "The 'contains' function requires an argument.");
                    }
                    return items.Where(node => node.InnerHtml.Contains(keyword));
                case "empty":
                    return items.Where(node => !node.HasChildNodes);
                case "header":
                    // OriginalName keeps the source casing, so compare case-insensitively.
                    return FindChildNodes(items).Where(
                        node => HeaderNames.Contains(node.OriginalName, StringComparer.OrdinalIgnoreCase));
                default:
                    throw new NotSupportedException("函数不支持。");
            }
        }
        #endregion

        #region Class lookup
        /// <summary>
        /// Finds the descendants of a single node that carry the given class.
        /// </summary>
        /// <param name="hn">Root node whose descendants are searched.</param>
        /// <param name="Expression">Class expression, with or without the leading dot.</param>
        /// <returns>The matching descendants in document order.</returns>
        private ParallelQuery<HtmlNode> Class(HtmlNode hn, string Expression)
        {
            return Class(new HtmlNode[] { hn }, Expression);
        }

        /// <summary>
        /// Finds the descendants of the given nodes that carry the given class.
        /// </summary>
        /// <param name="_HtmlNodes">Root nodes whose descendants are searched.</param>
        /// <param name="Expression">Class expression, with or without the leading dot.</param>
        /// <returns>The matching descendants in document order.</returns>
        private ParallelQuery<HtmlNode> Class(IList<HtmlNode> _HtmlNodes, string Expression)
        {
            string className = Expression.TrimStart('.');

            // AsOrdered keeps document order; index based functions and SingleGet depend on it.
            return FindChildNodes(_HtmlNodes)
                .AsParallel()
                .AsOrdered()
                .Where(node => HasClass(node, className));
        }

        /// <summary>
        /// Tells whether the node's class attribute lists the given class name (case-insensitive).
        /// </summary>
        private static bool HasClass(HtmlNode node, string className)
        {
            var attribute = node.Attributes["class"];
            if (attribute == null)
            {
                return false;
            }

            string value = attribute.Value;
            if (string.IsNullOrEmpty(value))
            {
                return false;
            }

            return value
                .Split(ClassSeparators, StringSplitOptions.RemoveEmptyEntries)
                .Contains(className, StringComparer.OrdinalIgnoreCase);
        }
        #endregion

        #region Tag name lookup
        /// <summary>
        /// Finds the descendants of a single node whose element name matches.
        /// </summary>
        /// <param name="hn">Root node whose descendants are searched.</param>
        /// <param name="Expression">Element name to match (case-insensitive).</param>
        /// <returns>The matching descendants in document order.</returns>
        private ParallelQuery<HtmlNode> NodeType(HtmlNode hn, string Expression)
        {
            return NodeType(new HtmlNode[] { hn }, Expression);
        }

        /// <summary>
        /// Finds the descendants of the given nodes whose element name matches.
        /// </summary>
        /// <param name="_HtmlNodes">Root nodes whose descendants are searched.</param>
        /// <param name="Expression">Element name to match (case-insensitive).</param>
        /// <returns>The matching descendants in document order.</returns>
        private ParallelQuery<HtmlNode> NodeType(IList<HtmlNode> _HtmlNodes, string Expression)
        {
            // Ordinal comparison: tag names are identifiers, not linguistic text.
            return FindChildNodes(_HtmlNodes)
                .AsParallel()
                .AsOrdered()
                .Where(node => node.OriginalName.Equals(Expression, StringComparison.OrdinalIgnoreCase));
        }
        #endregion

        #region Descendant traversal
        /// <summary>
        /// Collects all element descendants of the given nodes in document order (depth-first).
        /// </summary>
        /// <param name="_HtmlNodes">Root nodes; the roots themselves are not part of the result.</param>
        /// <returns>The element descendants of every root.</returns>
        /// <exception cref="ArgumentNullException">The node list is null.</exception>
        private List<HtmlNode> FindChildNodes(IList<HtmlNode> _HtmlNodes)
        {
            if (_HtmlNodes == null)
            {
                throw new ArgumentNullException("_HtmlNodes");
            }

            List<HtmlNode> list = new List<HtmlNode>();
            foreach (HtmlNode root in _HtmlNodes)
            {
                if (root != null)
                {
                    FindChildNodesAction(root, list);
                }
            }

            return list;
        }

        /// <summary>
        /// Appends the element descendants of <paramref name="hn"/> to <paramref name="list"/>.
        /// </summary>
        /// <param name="hn">Node whose children are visited.</param>
        /// <param name="list">Accumulator that receives the descendants.</param>
        /// <exception cref="ArgumentNullException">The accumulator is null.</exception>
        private void FindChildNodesAction(HtmlNode hn, List<HtmlNode> list)
        {
            if (list == null)
            {
                throw new ArgumentNullException("list");
            }

            foreach (HtmlNode child in hn.ChildNodes)
            {
                // Test the child, not the parent: text and comment nodes are never selector
                // candidates and have no children worth descending into.
                if (child.NodeType == HtmlNodeType.Element)
                {
                    list.Add(child);
                    FindChildNodesAction(child, list);
                }
            }
        }
        #endregion
    }

抱歉!评论已关闭.