迅读网小说下载器
下面的程序是用 F#(FSharp)编写的。
编译命令:
fsc --optimize- --noframework --standalone -o:downloadNovel.exe downloadNovel.fsx
或者:
fsc downloadNovel.fsx --optimize+ --noframework --standalone --tailcalls- --platform:x86 -r:"System.Core.dll" -r:"System.dll"
使用方法:
downloadNovel http://www.xunlook.com/article/2e8446dd-c763-4346-9501-ac4597ee3712/index.shtml http://www.xunlook.com/article/9fe3a0fa-1a9f-4535-9348-7d323e878933/index.shtml
想并行下载,总是出错,未果。
========================
#light
//#nowarn "57"
open System;;
open System.IO;;
open System.Text;;
open System.Text.RegularExpressions;;
open System.Net;;
#if INTERACTIVE
//#r @"C:\Program Files\FSharpPowerPack-2.0.0.0\bin\FSharp.PowerPack.dll";;
//#r @"C:\Program Files\FSharpPowerPack-2.0.0.0\bin\FSharp.PowerPack.Parallel.Seq.dll";;
#endif
//open Microsoft.FSharp.Control.WebExtensions;;
//open Microsoft.FSharp.Collections;;
/// Downloads a whole novel given the URL of its table-of-contents page.
///
/// The index page is fetched, a file name is derived from its <title> and
/// author <meta> tag, and then every chapter page linked from the
/// "content_1" div is fetched in order and its text appended to
/// "<title>.txt" in the current directory.
///
/// url: address of the novel's index page (e.g. ".../index.shtml").
/// Returns unit. Network and file-system exceptions are not caught here.
let downloadNovel (url:string) =
    // Chapter links on the index page are relative file names, so they are
    // resolved against the directory part of the index URL.
    let urlBase = url.Substring(0, url.LastIndexOf(@"/") + 1)

    // An asynchronous variant was tried earlier and left disabled:
    // let downloadUrlAsync (url:string) = async{
    //     let req = WebRequest.Create(url)
    //     let! rsp = req.AsyncGetResponse()
    //     use rst = rsp.GetResponseStream()
    //     use reader = new StreamReader( rst, Encoding.GetEncoding("GB2312") )
    //     let! str = reader.AsyncReadToEnd()
    //     return str
    // }

    // Fetches a page synchronously and decodes the body as GB2312.
    let downloadUrl (url:string) =
        let req = WebRequest.Create(url)
        // The response itself is disposable; releasing it (not just the
        // stream and reader) frees the underlying connection after each page.
        use rsp = req.GetResponse()
        use rst = rsp.GetResponseStream()
        use reader = new StreamReader(rst, Encoding.GetEncoding("GB2312"))
        reader.ReadToEnd()

    // Appends text to a file in the current directory using the system
    // default encoding.
    let writeToFile (filename:string) (text:string) =
        let fp = Path.Combine(Environment.CurrentDirectory, filename)
        File.AppendAllText(fp, text, Encoding.Default)

    let indexHtml = downloadUrl url

    let rxTit = new Regex(@"<title>([^\b]+)</title>",
                          RegexOptions.IgnoreCase ||| RegexOptions.Singleline)
    let rxAut = new Regex(@"<meta name=""author"" content=""([^\b]+?)"" />",
                          RegexOptions.IgnoreCase ||| RegexOptions.Singleline)
    let rxCat = new Regex(@"\<div id=""content_1""\>([^\b]+)\</div\>",
                          RegexOptions.IgnoreCase ||| RegexOptions.Singleline)
    let rxLink = new Regex(@"<a href='([\d]+.shtml)'.*?>",
                           RegexOptions.IgnoreCase ||| RegexOptions.Singleline)
    let rxCon = new Regex(@"\<div id=""content_1""\>([^\b]+?)(<div id='nav_1'>|\</div>)",
                          RegexOptions.IgnoreCase ||| RegexOptions.Singleline)

    // File-name stem: "<page title>-作者<author>" with punctuation, line
    // breaks and site boilerplate removed, capped at 100 characters.
    let title =
        let pageTitle = rxTit.Match(indexHtml).Groups.[1].Value
        let author = rxAut.Match(indexHtml).Groups.[1].Value
        let raw = pageTitle + "-作者:" + author
        // NOTE(review): the second replacement on the next line is a no-op as
        // written (plain spaces are already replaced by "-"); its search text
        // may originally have been another whitespace character or an HTML
        // entity - confirm against the original script.
        let stripped =
            raw.Replace(" ", "-").Replace(" ", "")
                .Replace("\r","").Replace("\n","")
                .Replace(";","").Replace(":","").Replace(".","").Replace(",","")
                .Replace("+","").Replace("?","").Replace("@","").Replace("<","").Replace(">","")
                .Replace("|","").Replace(@"/","").Replace(@"\","").Replace("_","")
                .Replace("迅读网","").Replace("天涯整理版","")
        // Drop any remaining characters the platform forbids in file names
        // so that writing the output file cannot fail because of the title.
        let safe = String.concat "" (stripped.Split(Path.GetInvalidFileNameChars()))
        safe.Substring(0, Math.Min(safe.Length, 100))

    // Extracts the chapter text from a chapter page, converts <br> tags to
    // CRLF line breaks, and appends the result to the novel's text file.
    let saveChapter (page:string) =
        let body = rxCon.Match(page).Groups.[1].Value
        // NOTE(review): the last two replacements below are no-ops as written;
        // they look like HTML-entity substitutions whose search text was lost -
        // confirm against the original script.
        let text =
            body.Replace("<br>","\r\n").Replace("<br />","\r\n")
                .Replace("°","度").Replace("&#8226;","·").Replace("·","·")
                .Replace(" "," ")
        writeToFile (title + ".txt") text

    // Walk every chapter link inside the table-of-contents div, in page order.
    for section in rxCat.Matches(indexHtml) do
        let toc : string = section.Groups.[1].Value
        for link in rxLink.Matches(toc) do
            downloadUrl (urlBase + link.Groups.[1].Value) |> saveChapter
/// Program entry point.
///
/// With no arguments, prints the usage text and exits with code 0.
/// Otherwise treats every argument as a novel index URL, downloads each one
/// in order, prints a completion message and returns 0.
[<EntryPoint>]
let main args =
    if Array.isEmpty args then
        // Usage text shown when no address was supplied.
        let usage =
            "迅读网小说下载器。用法:\r\n" +
            "downloadNovel httpAddress\r\n" +
            "httpAddress 为某一篇小说的目录地址。比如:一部让你笑得蛋疼的唐朝全史\r\n" +
            "http://www.xunlook.com/article/d51ddcd6-6b0c-469a-9dd1-4f061c64a03b/index.shtml"
        printfn "%s" usage
        exit 0
    else
        for novelUrl in args do
            downloadNovel novelUrl
        printfn "%s" "下载结束。"
    0