我正在尝试制作自己的简单网络抓取工具.我想从URL下载具有特定扩展名的文件.我写了以下代码:
- private void button1_Click(object sender,RoutedEventArgs e)
- {
- if (bw.IsBusy) return;
- bw.DoWork += new DoWorkEventHandler(bw_DoWork);
- bw.RunWorkerAsync(new string[] { URL.Text,SavePath.Text,Filter.Text });
- }
- //--------------------------------------------------------------------------------------------
- void bw_DoWork(object sender,DoWorkEventArgs e)
- {
- try
- {
- ThreadPool.SetMaxThreads(4,4);
- string[] strs = e.Argument as string[];
- Regex reg = new Regex("<a(\\s*[^>]*?){0,1}\\s*href\\s*\\=\\s*\\\"([^>]*?)\\\"\\s*[^>]*>(.*?)</a>",RegexOptions.Compiled | RegexOptions.CultureInvariant | RegexOptions.IgnoreCase);
- int i = 0;
- string domainS = strs[0];
- string Extensions = strs[2];
- string OutDir = strs[1];
- var domain = new Uri(domainS);
- string[] Filters = Extensions.Split(new char[] { ';',',' ' },StringSplitOptions.RemoveEmptyEntries);
- string outPath = System.IO.Path.Combine(OutDir,string.Format("File_{0}.html",i));
- WebClient webClient = new WebClient();
- string str = webClient.DownloadString(domainS);
- str = str.Replace("\r\n"," ").Replace('\n',' ');
- MatchCollection mc = reg.Matches(str);
- int NumOfThreads = mc.Count;
- Parallel.ForEach(mc.Cast<Match>(),new ParallelOptions { MaxDegreeOfParallelism = 2,},mat =>
- {
- string val = mat.Groups[2].Value;
- var link = new Uri(domain,val);
- foreach (string ext in Filters)
- if (val.EndsWith("." + ext))
- {
- Download((object)new object[] { OutDir,link });
- break;
- }
- });
- throw new Exception("Finished !");
- }
- catch (System.Exception ex)
- {
- ReportException(ex);
- }
- finally
- {
- }
- }
- //--------------------------------------------------------------------------------------------
- private static void Download(object o)
- {
- try
- {
- object[] objs = o as object[];
- Uri link = (Uri)objs[1];
- string outPath = System.IO.Path.Combine((string)objs[0],System.IO.Path.GetFileName(link.ToString()));
- if (!File.Exists(outPath))
- {
- //WebClient webClient = new WebClient();
- //webClient.DownloadFile(link,outPath);
- DownloadFile(link.ToString(),outPath);
- }
- }
- catch (System.Exception ex)
- {
- ReportException(ex);
- }
- }
- //--------------------------------------------------------------------------------------------
- private static bool DownloadFile(string url,string filePath)
- {
- try
- {
- HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(url);
- request.UserAgent = "Web Crawler";
- request.Timeout = 40000;
- WebResponse response = request.GetResponse();
- Stream stream = response.GetResponseStream();
- using (FileStream fs = new FileStream(filePath,FileMode.CreateNew))
- {
- const int siz = 1000;
- byte[] bytes = new byte[siz];
- for (; ; )
- {
- int count = stream.Read(bytes,siz);
- fs.Write(bytes,count);
- if (count == 0) break;
- }
- fs.Flush();
- fs.Close();
- }
- }
- catch (System.Exception ex)
- {
- ReportException(ex);
- return false;
- }
- finally
- {
- }
- return true;
- }
问题是虽然它适用于2个并行下载:
- new ParallelOptions { MaxDegreeOfParallelism = 2,}
…它不适用于更大程度的并行性,如:
- new ParallelOptions { MaxDegreeOfParallelism = 5,}
…我得到连接超时异常.
起初我以为是因为WebClient:
- //WebClient webClient = new WebClient();
- //webClient.DownloadFile(link,outPath);
…但是当我用使用HttpWebRequest的函数DownloadFile替换它时,我仍然遇到错误.
我已在许多网页上测试过,没有任何改变.我还确认了chrome的扩展“Download Master”,这些Web服务器允许多个并行下载.
有没有人知道为什么我会超时尝试并行下载多个文件?
解决方法
您需要分配
ServicePointManager.DefaultConnectionLimit
.同一主机的默认并发连接是2.有关使用web.config
connectionManagement
的信息,请参阅
related SO post.