ZC_FootBallNewsJob.cs 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262
  1. using FCS.Common;
  2. using FCS.Crawler.Tools;
  3. using FCS.Interface;
  4. using FCS.Models;
  5. using HtmlAgilityPack;
  6. using Newtonsoft.Json;
  7. using Quartz;
  8. using System;
  9. using System.Collections.Generic;
  10. using System.Data;
  11. using System.Linq;
  12. using System.Text;
  13. using System.Text.RegularExpressions;
  14. namespace FCS.Crawler.ZCLotteryNews
  15. {
  16. public class ZC_FootBallNewsJob : CommonJob, IJob
  17. {
  /// <summary>
  /// Creates the job and wires up its logger and the service used to store crawled news.
  /// </summary>
  public ZC_FootBallNewsJob()
  {
      var logger = new LogHelper();
      log = logger;

      var newsService = IOC.Resolve<IDTOpenCode>();
      services = newsService;
  }
  /// <summary>
  /// Scheduler entry point: reads the crawler configuration from the job's data map
  /// and then runs the crawl.
  /// </summary>
  /// <param name="context">Execution context supplied by the scheduler.</param>
  public void Execute(IJobExecutionContext context)
  {
      var dataMap = context.JobDetail.JobDataMap;
      Config = CommonHelper.GetConfigFromDataMap(dataMap);

      GetAll();
  }
  /// <summary>
  /// Crawls each list page and stores every article scraped from it.
  /// </summary>
  public void GetAll()
  {
      // The only list page crawled is the hard-coded NetEase "zc" landing page.
      var listPages = new[] { "http://sports.163.com/zc/" };

      foreach (var listPage in listPages)
      {
          // Fetch the news list for this page, then hand each item to the storage service.
          GetOpenListFromMainUrl(listPage)
              .ForEach(article => services.AddNews(currentNews, article));
      }
  }
  /// <summary>
  /// Crawls a NetEase news list page and returns the articles it links to.
  /// </summary>
  /// <param name="mainUrl">URL of the list page to download.</param>
  /// <returns>
  /// The articles that were scraped successfully and have a non-empty headline.
  /// The list is empty when the page cannot be downloaded or contains no matching links.
  /// If an error occurs it is logged and whatever was collected so far is returned.
  /// </returns>
  private List<Base_News> GetOpenListFromMainUrl(string mainUrl)
  {
      var result = new List<Base_News>();
      try
      {
          var htmlResource = NetHelper.GetUrlResponse(mainUrl, Encoding.GetEncoding("gbk"));
          if (string.IsNullOrEmpty(htmlResource))
          {
              return result;
          }

          var doc = new HtmlDocument();
          doc.LoadHtml(htmlResource);

          // Headline links: <a> inside <h3> inside any element whose class is "news_item".
          var anchors = doc.DocumentNode.SelectNodes("//*[@class='news_item']/h3/a");
          if (anchors == null)
          {
              return result;
          }

          // Collect distinct, non-blank hrefs in document order.
          // The set gives O(1) duplicate detection; the list keeps the order.
          var seen = new HashSet<string>();
          var articleUrls = new List<string>();
          foreach (HtmlNode anchor in anchors)
          {
              // The indexer returns null when the attribute is absent and does not
              // throw when the attribute is (invalidly) repeated.
              var href = anchor.Attributes["href"]?.Value;
              if (!string.IsNullOrWhiteSpace(href) && seen.Add(href))
              {
                  articleUrls.Add(href);
              }
          }

          // Scrape each article; GetNewsModel returns a record without a headline on failure.
          foreach (var articleUrl in articleUrls)
          {
              var article = GetNewsModel(articleUrl);
              if (!string.IsNullOrEmpty(article.FullHead))
              {
                  result.Add(article);
              }
          }
      }
      catch (Exception ex)
      {
          // Config may still be null if the crawl was started without Execute();
          // the null-conditional keeps the error handler itself from throwing.
          log.Error(GetType(),
              string.Format("【{0}】通过主抓取中超新闻时发生错误,错误信息【{1}】", Config?.Area + currentNews, ex.Message));
      }
      return result;
  }
  /// <summary>
  /// Downloads one NetEase article page and maps it onto a <see cref="Base_News"/> record.
  /// </summary>
  /// <param name="url">URL of the article page.</param>
  /// <returns>
  /// A populated record on success. An empty record (no <c>FullHead</c>) is returned when the
  /// page cannot be downloaded, lacks the expected headline, body or time markup, has an empty
  /// body, or an error occurs (the error is logged).
  /// </returns>
  private Base_News GetNewsModel(string url)
  {
      try
      {
          var htmlResource = NetHelper.GetUrlResponse(url, Encoding.GetEncoding("gb2312"));
          if (string.IsNullOrEmpty(htmlResource))
          {
              return new Base_News();
          }

          var doc = new HtmlDocument();
          doc.LoadHtml(htmlResource);

          // Locate every required piece up front so that missing markup is handled by a
          // plain guard instead of a caught NullReferenceException / ArgumentOutOfRangeException.
          var mainNode = doc.DocumentNode.SelectSingleNode("//*[@class='post_content_main']");
          var titleNode = mainNode?.ChildNodes.FirstOrDefault(node => node.Name == "h1");
          var contentNode = doc.DocumentNode.SelectSingleNode("//*[@class='post_text']");
          var timeNode = doc.DocumentNode.SelectSingleNode("//*[@class='post_time_source']")?.FirstChild;
          if (titleNode == null || contentNode == null || timeNode == null)
          {
              return new Base_News();
          }

          var contentHtml = contentNode.InnerHtml.Trim();
          if (contentHtml.Length == 0)
          {
              return new Base_News();
          }

          var categoryName = currentNews.GetEnumDescription();
          var news = new Base_News
          {
              Id = Guid.NewGuid().ToString(),
              FullHead = titleNode.InnerText.Trim(),
              AuthorName = "zc55128",
              NewsContent = NoHTML(contentHtml),
              // The first child of the time/source element holds the publication time,
              // possibly followed by the source label, which is stripped here.
              ReleaseTime = timeNode.InnerText.Replace("来源:", "").Replace("网易体育", "").Replace("\n", ""),
              SourceName = "网易体育 中超",
              TypeId = (int)NewsTypeEnum.中超,
              Category = categoryName,
              CreateDate = DateTime.Now,
          };

          // Lead image: first <img> directly under an element with class "f_center" inside the body.
          // An image without a src attribute simply leaves SourceAddress unset.
          var imgSrc = contentNode.SelectSingleNode(".//*[@class='f_center']/img")?.Attributes["src"]?.Value;
          if (imgSrc != null)
          {
              news.SourceAddress = imgSrc;
          }

          // Both format arguments come from code (a literal table name and the enum description),
          // not from crawled content.
          var sql = string.Format(GetLotterySqlByTableName, "Base_DataItemDetail", categoryName);
          var res = SqlHelper.ExecuteDataset(CommandType.Text, sql);
          if (res != null && res.Tables.Count > 0 && res.Tables[0].Rows.Count > 0)
          {
              news.CategoryId = res.Tables[0].Rows[0]["Id"].ToString();
          }

          return news;
      }
      catch (Exception ex)
      {
          // Config may still be null if the crawl was started without Execute();
          // the null-conditional keeps the error handler itself from throwing.
          log.Error(GetType(),
              string.Format("【{0}】通过主抓取中超新闻时发生错误,错误信息【{1}】", Config?.Area + currentNews, ex.Message));

          // Callers treat a record without a headline as "skip this article".
          return new Base_News();
      }
  }
  /// <summary>
  /// Builds the list of list-page URLs to crawl.
  /// </summary>
  /// <param name="config">
  /// Crawler configuration; <c>MainUrl</c> is used as a format template that receives the page
  /// number, and <c>MainUrlPages</c> is the number of pages (values below 1 are treated as 1).
  /// </param>
  /// <returns>Distinct page URLs in page order; page 1 is always the fixed landing page.</returns>
  private List<string> GetMainUrl(FCSConfig config)
  {
      string pagedTemplate = config.MainUrl;
      int pageCount = config.MainUrlPages <= 0 ? 1 : config.MainUrlPages;

      var pageUrls = new List<string>();
      for (int page = 1; page <= pageCount; page++)
      {
          // The first page has a fixed address; later pages come from the template.
          string pageUrl = page == 1
              ? "http://sports.163.com/zc/"
              : string.Format(pagedTemplate, page);

          if (pageUrls.Contains(pageUrl))
          {
              continue;
          }

          pageUrls.Add(pageUrl);
      }

      return pageUrls;
  }
  // Elements whose whole content (not only the tags) must be dropped. The match is lazy so
  // that text between two separate blocks survives.
  private static readonly Regex NonTextBlockRegex = new Regex(
      @"<(script|style|iframe|frameset)\b[\s\S]*?</\1\s*>",
      RegexOptions.IgnoreCase);

  // Any remaining tag, <img> included.
  private static readonly Regex TagRegex = new Regex(@"<[^>]*>");

  // Form feed, line feed, carriage return, tab and vertical tab.
  private static readonly Regex ControlWhitespaceRegex = new Regex("[\f\n\r\t\v]");

  /// <summary>
  /// Reduces an HTML fragment to plain text for storage.
  /// </summary>
  /// <remarks>
  /// Steps, in order: drop script/style/iframe/frameset blocks including their content, strip
  /// all remaining tags, remove line breaks and tabs, cut everything from the "本文来源" credit
  /// onwards (together with the single character right before it), and finally replace the
  /// "网易体育" brand name with "彩吧足球".
  /// Attribute-level filters (href=...script:, on-event handlers) are not needed because
  /// every tag is removed entirely; applying them could only alter visible text.
  /// </remarks>
  /// <param name="html">HTML fragment; may be null or empty.</param>
  /// <returns>The plain-text content, or an empty string for null/empty input.</returns>
  public static string NoHTML(string html)
  {
      if (string.IsNullOrEmpty(html))
      {
          return string.Empty;
      }

      html = NonTextBlockRegex.Replace(html, "");
      html = TagRegex.Replace(html, "");
      html = ControlWhitespaceRegex.Replace(html, "");

      // Remove the trailing source / editor credit. The character immediately before the
      // marker is dropped as well; Math.Max guards the case where the marker is at position 0.
      int index = html.IndexOf("本文来源", StringComparison.Ordinal);
      if (index != -1)
      {
          html = html.Substring(0, Math.Max(index - 1, 0));
      }

      return html.Replace("网易体育", "彩吧足球");
  }
  #region Initialization
  /// <summary>
  /// News type handled by this job; always <c>NewsTypeEnum.中超</c>.
  /// </summary>
  private NewsTypeEnum currentNews
  {
      get { return NewsTypeEnum.中超; }
  }
  #endregion Initialization
  #region SQL statements
  /// <summary>
  /// Query template that looks up the row for a news category.
  /// <c>{0}</c> is the table name and <c>{1}</c> is the <c>ItemName</c> to match.
  /// Read-only: the template is a constant and must not be reassigned at runtime.
  /// </summary>
  private static readonly string GetLotterySqlByTableName = @"SELECT TOP 1 [ID],[ItemId],[ItemName] FROM [dbo].[{0}] where ItemCode='NewsCategory' and [ItemName]='{1}' "; // disabled filter: WHERE [IsChecked] = 1 AND [IsPassed] = 1
  #endregion
  235. }
  236. }