之前一直在关注句子迷这个网站,在里面可以找到自己喜欢的名人的经典语录。
每一次都去打开,再去翻页查看太烦,有没有什么方法可以把网站中的句子直接抓出来,保存到本地txt文件中?
然后在网上查看了一些资料,自己动手写了一个控制台应用程序。
注:代码还存在一些问题:多次发起请求后,线程会被占用,目前尚未解决,求助中。
具体实现方式:
步骤1:通过HttpWebRequest发送一个请求,得到HttpWebResponse后,将整个HTML页面的数据全部读入StreamReader中(原文此处的截图已缺失)。
步骤2:将读入流中的数据进行处理,只取包含句子的文本,这里需要查看页面中的html结构。
在取文本的过程中,使用正则表达式,取出想要的数据
- ///正则表达式
- Regex regText = new Regex(@"<div\s+class\=\""views-field-PHPcode-1\"">([\S\s]*?)</div>",RegexOptions.IgnoreCase);
- Regex objRegExp = new Regex("<(.|\n)+?>");
具体代码如下:
- class Program
- {
// Lock taken by Write so that only one thread writes to disk at a time.
// NOTE(review): Main and DownLoad1 also use a static string `directoryName1`
// whose declaration is not visible in this section - confirm it is declared
// elsewhere in the class.
private static readonly ReaderWriterLock writeLock = new ReaderWriterLock();
// Timeout, in milliseconds, passed to AcquireWriterLock.
private const int LOCK = 1000;
// Duration, in milliseconds, passed to Thread.Sleep by Write.
private const int SLEEP = 100;
/// <summary>
/// Console entry point: prints the usage banner, reads the output folder and a
/// comma-separated list of author names, then starts one download thread per
/// author. The author name is handed to the worker through the thread's name.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    Console.WriteLine("-------------------句子迷文档下载----------");
    Console.WriteLine("操作API:");
    Console.WriteLine("注释1:查询的的作者名,以逗号(英文)隔开,例子如下:");
    Console.WriteLine(" 鲁迅,胡适,顾城");

    Console.WriteLine("注释2:保存的盘符,例子如下:");
    Console.WriteLine(" X:\\\\句子迷 ");
    Console.WriteLine("-------------------文档结束----------");
    Console.WriteLine("请输入需要保存的盘:");
    directoryName1 = Console.ReadLine();

    Console.WriteLine("请输入作者姓名:");
    // ReadLine returns null when standard input has reached end-of-file;
    // fall back to an empty string so Split does not throw.
    string writers = Console.ReadLine() ?? string.Empty;
    string[] strWriter = writers.Split(new[] { ',' }, StringSplitOptions.RemoveEmptyEntries);

    foreach (string rawName in strWriter)
    {
        // Ignore whitespace around names ("鲁迅, 胡适") and blank entries,
        // which would otherwise start a thread searching for nothing.
        string writerName = rawName.Trim();
        if (writerName.Length == 0)
        {
            continue;
        }

        Thread thread = new Thread(DownLoad1);
        thread.Name = writerName;
        thread.Start();
    }

    Console.ReadKey();
}
-
- #region 通过网页获取句子迷中的名言名句
/// <summary>
/// Downloads up to 10 search-result pages of sentences for one author from
/// juzimi.com and appends the sentences of each page to that author's text file.
/// </summary>
/// <remarks>
/// The author name is read from the name of the current thread, so this method
/// is meant to be used as a thread entry point. Paging stops at the first page
/// that yields no sentences or that answers with HTTP 404; any other failure is
/// logged to the console and the next page is attempted.
/// </remarks>
public static void DownLoad1()
{
    const int MaxPages = 10;

    string writerName = Thread.CurrentThread.Name ?? string.Empty;
    // Output folder: the user-supplied value, or the default when none was given.
    string directoryName = string.IsNullOrEmpty(directoryName1) ? "X:\\句子迷" : directoryName1;

    for (int page = 0; page < MaxPages; page++)
    {
        string url = string.Empty;
        try
        {
            url = BuildSearchUrl(writerName, page);
            List<string> list = GetHtmlTextList(FetchPageContent(url));
            if (list.Count == 0)
            {
                LogPageMessage(url, ":未找到相关信息;");
                break;
            }

            try
            {
                // The file is named after the author.
                Write(directoryName, writerName, list);
                LogPageMessage(url, "句子信息下载完成!");
            }
            catch (Exception e)
            {
                // A failed save of one page must not abort the remaining pages.
                LogPageMessage(url, " 错误信息:" + e.Message);
            }
        }
        catch (WebException ex)
        {
            LogPageMessage(url, " 错误信息:" + ex.Message);

            // Inspect the real HTTP status instead of searching the exception
            // text for "404", and dispose the error response so its connection
            // is released as well.
            bool notFound;
            using (HttpWebResponse errorResponse = ex.Response as HttpWebResponse)
            {
                notFound = errorResponse != null && errorResponse.StatusCode == HttpStatusCode.NotFound;
            }

            if (notFound)
            {
                break;
            }
        }
        catch (Exception ex)
        {
            LogPageMessage(url, " 错误信息:" + ex.Message);
        }
    }
}

// Builds the search URL for the given author and zero-based page index.
private static string BuildSearchUrl(string writerName, int page)
{
    // Escape the name so reserved characters (space, '?', '#', '/') cannot alter the URL.
    string escapedName = Uri.EscapeDataString(writerName);
    if (page == 0)
    {
        return "http://www.juzimi.com/search/node/" + escapedName + "%20type:sentence";
    }

    return "http://www.juzimi.com/search/node/" + escapedName + "%20type%3Asentence?page=" + page;
}

// Performs a GET request and returns the whole response body as text.
private static string FetchPageContent(string url)
{
    var request = (HttpWebRequest)WebRequest.Create(url);

    // Disposing the response returns its connection to the pool. Leaked
    // responses keep connections occupied and make later requests to the
    // same host block.
    using (var response = (HttpWebResponse)request.GetResponse())
    using (Stream stream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(stream))
    {
        return reader.ReadToEnd();
    }
}

// Writes one console log line: timestamp, URL, detail text, managed thread id.
private static void LogPageMessage(string url, string detail)
{
    Console.WriteLine("时间:" + DateTime.Now + " 当前网址:" + url + detail + "当前线程:" + Thread.CurrentThread.ManagedThreadId);
}
-
/// <summary>
/// Appends the given sentences to the text file named after
/// <paramref name="fileName"/> inside <paramref name="path"/>, creating the
/// folder and the file when they do not exist yet.
/// </summary>
/// <param name="path">Folder in which the text file is stored.</param>
/// <param name="fileName">File name without the ".txt" extension.</param>
/// <param name="strBook">Sentences to write; each one is followed by a blank line.</param>
public static void Write(string path, string fileName, List<string> strBook)
{
    writeLock.AcquireWriterLock(LOCK);
    try
    {
        if (!Directory.Exists(path))
        {
            Directory.CreateDirectory(path);
        }

        string filePlace = path + "\\" + fileName + ".txt";

        // FileMode.Append creates the file when it is missing, so there is no
        // need for a separate File.Create call. The previous File.Create call
        // left an open handle behind, which made opening the file fail.
        using (FileStream fs = new FileStream(filePlace, FileMode.Append))
        using (StreamWriter sw = new StreamWriter(fs))
        {
            foreach (string item in strBook)
            {
                sw.Write(item + "\r\n\r\n");
            }
        }
    }
    finally
    {
        // Always release the lock, even when the write fails; otherwise the
        // other threads time out waiting for it.
        writeLock.ReleaseWriterLock();
    }

    // Pause after the lock has been released so that waiting threads are not
    // held up by the sleep.
    Thread.Sleep(SLEEP);
}
-
-
// Matches one <div class="views-field-PHPcode-1"> ... </div> element
// (case-insensitive, shortest possible content). Built once and reused.
private static readonly Regex SentenceDivRegex = new Regex(@"<div\s+class\=\""views-field-PHPcode-1\"">([\S\s]*?)</div>", RegexOptions.IgnoreCase);

/// <summary>
/// Extracts the text of every "views-field-PHPcode-1" div found in an HTML page.
/// </summary>
/// <param name="sHtmlText">HTML source to search; null or empty yields an empty list.</param>
/// <returns>
/// The content of each matched div with its HTML tags removed, in the order the
/// divs appear in the page.
/// </returns>
public static List<string> GetHtmlTextList(string sHtmlText)
{
    List<string> sentences = new List<string>();
    if (string.IsNullOrEmpty(sHtmlText))
    {
        return sentences;
    }

    foreach (Match match in SentenceDivRegex.Matches(sHtmlText))
    {
        // The match still includes the surrounding <div> tags; strip all markup.
        sentences.Add(replceHtml(match.Value));
    }

    return sentences;
}
-
// Matches a single HTML tag: '<', at least one character, up to the nearest '>'.
// Singleline lets '.' match line breaks, so tags spanning several lines are
// matched too. Built once and reused.
private static readonly Regex HtmlTagRegex = new Regex("<.+?>", RegexOptions.Singleline);

/// <summary>
/// Removes all HTML tags from a fragment, leaving only the text between them.
/// </summary>
/// <param name="strHtml">HTML fragment to clean; null or empty yields an empty string.</param>
/// <returns>The input with every tag removed.</returns>
public static string replceHtml(string strHtml)
{
    if (string.IsNullOrEmpty(strHtml))
    {
        return string.Empty;
    }

    return HtmlTagRegex.Replace(strHtml, "");
}
- #endregion