Intel_FootBallNewsJob.cs 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249
  1. using FCS.Common;
  2. using FCS.Interface;
  3. using FCS.Models;
  4. using HtmlAgilityPack;
  5. using Quartz;
  6. using System;
  7. using System.Collections.Generic;
  8. using System.Data;
  9. using System.Linq;
  10. using System.Text;
  11. using System.Text.RegularExpressions;
  12. using System.Threading.Tasks;
  13. namespace FCS.Crawler.ZCLotteryNews
  14. {
  15. /// <summary>
  16. /// 国际新闻
  17. /// </summary>
  18. public class Intel_FootBallNewsJob : CommonJob, IJob
  19. {
  /// <summary>
  /// Creates the job: instantiates the log helper and resolves the
  /// <c>IDTOpenCode</c> service (used by <c>GetAll</c> to store news) from the IOC container.
  /// </summary>
  public Intel_FootBallNewsJob()
  {
      var logger = new LogHelper();
      log = logger;

      var newsService = IOC.Resolve<IDTOpenCode>();
      services = newsService;
  }
  /// <summary>
  /// Job entry point: reads the configuration out of the job's data map and then
  /// runs the crawl via <see cref="GetAll"/>.
  /// </summary>
  /// <param name="context">Execution context whose <c>JobDetail.JobDataMap</c> holds the configuration.</param>
  public void Execute(IJobExecutionContext context)
  {
      var dataMap = context.JobDetail.JobDataMap;
      Config = CommonHelper.GetConfigFromDataMap(dataMap);
      GetAll();
  }
  /// <summary>
  /// Crawls every configured list page and hands each extracted news item to
  /// <c>services.AddNews</c> under the <c>currentNews</c> category.
  /// </summary>
  public void GetAll()
  {
      string[] listPages = { "http://sports.163.com/world/" };

      for (int pageIndex = 0; pageIndex < listPages.Length; pageIndex++)
      {
          // Fetch the international news list for this page.
          List<Base_News> crawled = GetOpenListFromMainUrl(listPages[pageIndex]);

          for (int itemIndex = 0; itemIndex < crawled.Count; itemIndex++)
          {
              services.AddNews(currentNews, crawled[itemIndex]);
          }
      }
  }
  /// <summary>
  /// Crawls a NetEase Sports list page and builds a news item for every article
  /// linked from its "topnews" section (list entries first, then headline links).
  /// </summary>
  /// <param name="mainUrl">Absolute URL of the list page; the response is decoded as GBK.</param>
  /// <returns>
  /// The articles for which a non-empty headline was extracted. Empty when the page
  /// cannot be fetched, has no matching links, or an error occurs (errors are logged, not thrown).
  /// </returns>
  private List<Base_News> GetOpenListFromMainUrl(string mainUrl)
  {
      var result = new List<Base_News>();
      try
      {
          // Constructed only to validate mainUrl: a malformed address throws here and is logged below.
          var url = new Uri(mainUrl);

          var htmlResource = NetHelper.GetUrlResponse(mainUrl, Encoding.GetEncoding("gbk"));
          if (htmlResource == null)
          {
              return result;
          }

          var doc = new HtmlDocument();
          doc.LoadHtml(htmlResource);

          // SelectNodes yields null (not an empty collection) when nothing matches, so either
          // list may be null on its own; the helper tolerates that instead of throwing.
          HtmlNodeCollection listLinks = doc.DocumentNode.SelectNodes("//*[@class='topnews']/ul/li/a");
          HtmlNodeCollection headlineLinks = doc.DocumentNode.SelectNodes("//*[@class='topnews']/h2/a");

          var urls = new List<string>();
          var seen = new HashSet<string>();
          CollectDistinctHrefs(listLinks, urls, seen);
          CollectDistinctHrefs(headlineLinks, urls, seen);

          // Fetch each article; GetNewsModel returns a model without a headline when extraction fails.
          foreach (var articleUrl in urls)
          {
              var news = GetNewsModel(articleUrl);
              if (!string.IsNullOrEmpty(news.FullHead))
              {
                  result.Add(news);
              }
          }
      }
      catch (Exception ex)
      {
          log.Error(GetType(),
              string.Format("【{0}】通过主抓取国际新闻信息时发生错误,错误信息【{1}】", Config.Area + currentNews, ex.Message));
      }
      return result;
  }

  /// <summary>
  /// Appends the href of each anchor to <paramref name="urls"/> in encounter order,
  /// skipping anchors without a usable href and hrefs that were already collected.
  /// </summary>
  /// <param name="anchors">Anchor nodes to scan; may be null when the XPath matched nothing.</param>
  /// <param name="urls">Ordered list receiving the distinct hrefs.</param>
  /// <param name="seen">Set of hrefs collected so far, shared across calls for de-duplication.</param>
  private static void CollectDistinctHrefs(HtmlNodeCollection anchors, List<string> urls, HashSet<string> seen)
  {
      if (anchors == null)
      {
          return;
      }

      foreach (HtmlNode anchor in anchors)
      {
          // The indexer returns null when the attribute is absent and, unlike
          // SingleOrDefault, does not throw when the attribute is duplicated.
          HtmlAttribute hrefAttribute = anchor.Attributes["href"];
          if (hrefAttribute == null || string.IsNullOrWhiteSpace(hrefAttribute.Value))
          {
              continue;
          }

          // HashSet.Add returns false for a value that is already present.
          if (seen.Add(hrefAttribute.Value))
          {
              urls.Add(hrefAttribute.Value);
          }
      }
  }
  /// <summary>
  /// Downloads a single article page (decoded as GB2312) and extracts its headline,
  /// release time, first centered image, plain-text body and category.
  /// </summary>
  /// <param name="url">URL of the article page.</param>
  /// <returns>
  /// A fully populated <c>Base_News</c>, or an unpopulated instance (null <c>FullHead</c>)
  /// when the page cannot be fetched, a required element is missing, the body is empty,
  /// or an error occurs. Errors are logged, never thrown.
  /// </returns>
  private Base_News GetNewsModel(string url)
  {
      Base_News YCNew = new Base_News();
      try
      {
          var htmlResource = NetHelper.GetUrlResponse(url, Encoding.GetEncoding("gb2312"));
          if (htmlResource == null)
          {
              return YCNew;
          }

          var doc = new HtmlDocument();
          doc.LoadHtml(htmlResource);

          var mainDiv = doc.DocumentNode.SelectSingleNode("//*[@class='post_content_main']");
          if (mainDiv == null)
          {
              return YCNew;
          }

          // The headline is the first <h1> directly under the main container.
          var titleNode = mainDiv.ChildNodes.FirstOrDefault(node => node.Name == "h1");
          if (titleNode == null)
          {
              return YCNew;
          }

          var contentDiv = doc.DocumentNode.SelectSingleNode("//*[@class='post_text']");
          if (contentDiv == null)
          {
              return YCNew;
          }

          string contentHtml = contentDiv.InnerHtml.Trim();
          if (contentHtml.Length == 0)
          {
              // Nothing to store; previously this path fed null into LoadHtml/NoHTML and threw.
              return YCNew;
          }

          var timeDiv = doc.DocumentNode.SelectSingleNode("//*[@class='post_time_source']");
          if (timeDiv == null || timeDiv.FirstChild == null)
          {
              // Articles without the time/source block are skipped, as before, but without an exception.
              return YCNew;
          }
          string releaseTime = timeDiv.FirstChild.InnerText.Replace("来源:", "").Replace("网易体育", "").Replace("\n", "");

          // Optional lead image: first <img> under an element with class "f_center" inside the body.
          string imageUrl = null;
          var imgdoc = new HtmlDocument();
          imgdoc.LoadHtml(contentHtml);
          var img = imgdoc.DocumentNode.SelectSingleNode("//*[@class='f_center']/img");
          if (img != null)
          {
              var srcAttribute = img.Attributes["src"];
              if (srcAttribute != null)
              {
                  imageUrl = srcAttribute.Value;
              }
          }

          string headline = titleNode.InnerText.Trim();
          string plainText = NoHTML(contentHtml);

          // Look up the category id. Both format arguments are fixed values from this class,
          // not crawled text.
          var categoryName = currentNews.GetEnumDescription();
          string categoryId = null;
          var sql = string.Format(GetLotterySqlByTableName, "Base_DataItemDetail", categoryName);
          var res = SqlHelper.ExecuteDataset(CommandType.Text, sql);
          if (res != null && res.Tables.Count > 0 && res.Tables[0].Rows.Count > 0)
          {
              categoryId = res.Tables[0].Rows[0]["Id"].ToString();
          }

          // Populate the model only after every step that can throw has succeeded, so a
          // failure never leaves a half-filled item with a headline for the caller to accept.
          YCNew.ReleaseTime = releaseTime;
          if (imageUrl != null)
          {
              YCNew.SourceAddress = imageUrl;
          }
          YCNew.Id = Guid.NewGuid().ToString();
          YCNew.FullHead = headline;
          YCNew.AuthorName = "zc55128";
          YCNew.NewsContent = plainText;
          YCNew.SourceName = "网易体育 国际";
          YCNew.TypeId = (int)NewsTypeEnum.国际;
          if (categoryId != null)
          {
              YCNew.CategoryId = categoryId;
          }
          YCNew.Category = categoryName;
          YCNew.CreateDate = DateTime.Now;
      }
      catch (Exception ex)
      {
          log.Error(GetType(),
              string.Format("【{0}】通过主抓取国际新闻时发生错误,错误信息【{1}】", Config.Area + currentNews, ex.Message));
      }
      return YCNew;
  }
  // Patterns used by NoHTML, built once instead of on every call.
  // [\s\S] matches any character including newlines; the lazy quantifier stops at the
  // first closing tag so text between two separate blocks is not swallowed.
  private static readonly Regex ScriptBlockRegex =
      new Regex(@"<script[\s\S]+?</script *>", RegexOptions.IgnoreCase);
  private static readonly Regex IframeBlockRegex =
      new Regex(@"<iframe[\s\S]+?</iframe *>", RegexOptions.IgnoreCase);
  private static readonly Regex FramesetBlockRegex =
      new Regex(@"<frameset[\s\S]+?</frameset *>", RegexOptions.IgnoreCase);
  private static readonly Regex AnyTagRegex = new Regex(@"<[^>]*>");
  private static readonly Regex ControlWhitespaceRegex = new Regex("[\f\n\r\t\v]");

  /// <summary>
  /// Converts an HTML fragment to plain text: drops script/iframe/frameset blocks
  /// together with their content, strips all remaining tags, removes line breaks and
  /// tabs, cuts the trailing "本文来源" (source/editor) footer and rebrands the site name.
  /// </summary>
  /// <param name="html">HTML fragment; may be null or empty.</param>
  /// <returns>The plain text; an empty string for null or empty input.</returns>
  public static string NoHTML(string html)
  {
      if (string.IsNullOrEmpty(html))
      {
          return string.Empty;
      }

      // Remove elements whose inner text must not leak into the article body.
      html = ScriptBlockRegex.Replace(html, "");
      html = IframeBlockRegex.Replace(html, "");
      html = FramesetBlockRegex.Replace(html, "");

      // Strip every remaining tag (this also covers <img>). The former href/on-event
      // attribute filters are gone on purpose: attributes disappear with their tags here,
      // so those filters could only ever rewrite visible text such as " no=".
      html = AnyTagRegex.Replace(html, "");

      // Remove form feeds, line breaks, tabs and vertical tabs (spaces are kept).
      html = ControlWhitespaceRegex.Replace(html, "");

      // Drop the source/editor footer. The character right before the marker is dropped
      // as well (existing behaviour, presumably a separator); clamp so that a marker at
      // position 0 yields an empty string instead of throwing.
      int index = html.IndexOf("本文来源", StringComparison.Ordinal);
      if (index != -1)
      {
          html = html.Substring(0, Math.Max(index - 1, 0));
      }

      return html.Replace("网易体育", "彩吧足球");
  }
  211. #region 初始化信息
  /// <summary>
  /// News category handled by this job (always the international category).
  /// </summary>
  private NewsTypeEnum currentNews
  {
      get { return NewsTypeEnum.国际; }
  }
  216. #endregion 初始化信息
  217. #region SQL语句
  /// <summary>
  /// SQL template that looks up the id of a news category by name.
  /// {0} is the table name, {1} the category item name.
  /// (A former extra filter, "WHERE [IsChecked] = 1 AND [IsPassed] = 1", is not applied.)
  /// </summary>
  // readonly: the template is a constant and must not be reassigned at runtime.
  private static readonly string GetLotterySqlByTableName = @"SELECT TOP 1 [ID],[ItemId],[ItemName] FROM [dbo].[{0}] where ItemCode='NewsCategory' and [ItemName]='{1}' ";
  222. #endregion
  223. }
  224. }