现在的位置: 首页 > 综合 > 正文

迅读网小说下载器

2014年07月06日 ⁄ 综合 ⁄ 共 4024字 ⁄ 字号 评论关闭

迅读网小说下载器

    下面的程序是用 F# 写的。

    编译命令:

   

fsc --optimize- --noframework --standalone -o:downloadNovel.exe downloadNovel.fsx

    或者:

fsc downloadNovel.fsx --optimize+ --noframework --standalone --tailcalls- --platform:x86 -r:"System.Core.dll" -r:"System.dll"

    使用方法:

downloadNovel http://www.xunlook.com/article/2e8446dd-c763-4346-9501-ac4597ee3712/index.shtml http://www.xunlook.com/article/9fe3a0fa-1a9f-4535-9348-7d323e878933/index.shtml

    想并行下载,但总是出错,未果。

========================

#light
//#nowarn "57"
open System;;
open System.IO;;
open System.Text;;
open System.Text.RegularExpressions;;
open System.Net;;
#if INTERACTIVE
//#r @"C:\Program Files\FSharpPowerPack-2.0.0.0\bin\FSharp.PowerPack.dll";;
//#r @"C:\Program Files\FSharpPowerPack-2.0.0.0\bin\FSharp.PowerPack.Parallel.Seq.dll";;
#endif
//open Microsoft.FSharp.Control.WebExtensions;;
//open Microsoft.FSharp.Collections;;

/// Downloads a novel from its table-of-contents page and saves it as text.
///
/// `url` is the address of the novel's index page. The page is fetched, a file
/// name is derived from its <title> and author <meta> tag, and then every
/// chapter page linked from the "content_1" div is fetched in order and its
/// text appended to "<title>.txt" in the current directory.
///
/// Nothing is caught here: any exception raised by the network or file calls
/// propagates to the caller. Returns unit.
let downloadNovel (url:string) =

  // Chapter links on the index page are relative file names, so keep everything
  // up to and including the last '/' as the prefix to prepend to them.
  let urlBase = url.Substring(0, url.LastIndexOf(@"/") + 1 )

  // Fetches `url` synchronously and returns the body decoded as GB2312.
  // (The author also tried an asynchronous variant based on AsyncGetResponse /
  // AsyncReadToEnd and abandoned it; downloads are strictly sequential.)
  let downloadUrl (url:string) =
    let req = WebRequest.Create(url)
    // Dispose the response as well as the stream and reader; otherwise one
    // connection stays open for every chapter that is downloaded.
    use rsp = req.GetResponse()
    use rst = rsp.GetResponseStream()
    use reader = new StreamReader( rst, Encoding.GetEncoding("GB2312") )
    reader.ReadToEnd()

  // Appends `text` to `filename` inside the current directory, using the
  // system default encoding.
  let writeToFile (filename:string) (text:string) =
    let fp = Path.Combine( Environment.CurrentDirectory, filename )
    File.AppendAllText( fp, text, Encoding.Default )

  let stream = downloadUrl url

  // Note on the patterns below: inside a character class "\b" is the backspace
  // character, so "[^\b]+" is used as "any run of characters, newlines included".
  let rxTit = new Regex(@"<title>([^\b]+)</title>",
                            RegexOptions.IgnoreCase ||| RegexOptions.Singleline)
  let rxAut = new Regex(@"<meta name=""author"" content=""([^\b]+?)"" />",
                            RegexOptions.IgnoreCase ||| RegexOptions.Singleline)
  // Catalogue block of the index page (greedy: runs to the last </div>).
  let rxCat = new Regex(@"\<div id=""content_1""\>([^\b]+)\</div\>",
                            RegexOptions.IgnoreCase ||| RegexOptions.Singleline)
  // Relative chapter link such as '123.shtml'; the dot is escaped so that it
  // only matches a literal '.'.
  let rxXXX = new Regex(@"<a href='([\d]+\.shtml)'.*?>",
                            RegexOptions.IgnoreCase ||| RegexOptions.Singleline)
  // Chapter text: from the content div up to the navigation div or first </div>.
  let rxCon = new Regex(@"\<div id=""content_1""\>([^\b]+?)(<div id='nav_1'>|\</div>)",
                            RegexOptions.IgnoreCase ||| RegexOptions.Singleline)

  // File-name stem: "<page title>-作者:<author>" with whitespace, punctuation
  // and site branding stripped, limited to 100 characters.
  let title =
    let raw = rxTit.Match( stream ).Groups.[1].Value
              + "-作者:" +
              rxAut.Match( stream ).Groups.[1].Value
    let stripped =
      raw.Replace("&nbsp;", "-").Replace(" ", "")
       .Replace("\r","").Replace("\n","")
       .Replace(";","").Replace(":","").Replace(".","").Replace(",","")
       .Replace("+","").Replace("?","").Replace("@","").Replace("<","").Replace(">","")
       .Replace("|","").Replace(@"/","").Replace(@"\","").Replace("_","")
       .Replace("迅读网","").Replace("天涯整理版","")
    // Drop whatever else the platform forbids in file names (e.g. '"' or '*'),
    // which would otherwise make File.AppendAllText throw.
    let invalid = Path.GetInvalidFileNameChars()
    let safe =
      new String( stripped.ToCharArray()
                  |> Array.filter (fun c -> not (Array.exists (fun bad -> bad = c) invalid)) )
    safe.Substring(0, Math.Min(safe.Length, 100))

  // Extracts the chapter text from a chapter page, converts line breaks and a
  // few HTML entities to plain text, and appends the result to the output file.
  let content (html:string) =
    let body = rxCon.Match( html ).Groups.[1].Value
    body.Replace("<br>","\r\n").Replace("<br />","\r\n")
     .Replace("&#176;","度").Replace("&amp;#8226;","·").Replace("&#183;","·")
     .Replace("&nbsp;"," ")
    |> writeToFile ( title + ".txt" )

  // Walk every catalogue block and, inside it, every chapter link in page
  // order; download each chapter and append it to the file.
  for catalogue in rxCat.Matches( stream ) do
    for link in rxXXX.Matches( catalogue.Groups.[1].Value ) do
      urlBase + link.Groups.[1].Value
      |> downloadUrl
      |> content
   
/// Program entry point.
///
/// With no command-line arguments, prints the usage text and terminates the
/// process with exit code 0. Otherwise every argument is passed, in order, to
/// `downloadNovel`, a completion message is printed, and 0 is returned.
[<EntryPoint>]
let main args =

  if Array.isEmpty args then
    // Nothing to download: show how the tool is meant to be called.
    let usage =
      "迅读网小说下载器。用法:\r\n" +
      "downloadNovel httpAddress\r\n" +
      "httpAddress 为某一篇小说的目录地址。比如:一部让你笑得蛋疼的唐朝全史\r\n" +
      "http://www.xunlook.com/article/d51ddcd6-6b0c-469a-9dd1-4f061c64a03b/index.shtml"
    printfn "%s" usage
    exit 0
  else
    // Each argument is the index-page address of one novel.
    for address in args do
      downloadNovel address
    printfn "%s" "下载结束。"

  0

抱歉!评论已关闭.