// CBA_FootBallNewsJob.cs
  1. using FCS.Common;
  2. using FCS.Crawler.Tools;
  3. using FCS.Interface;
  4. using FCS.Models;
  5. using HtmlAgilityPack;
  6. using Newtonsoft.Json;
  7. using Quartz;
  8. using System;
  9. using System.Collections.Generic;
  10. using System.Data;
  11. using System.Linq;
  12. using System.Text;
  13. using System.Text.RegularExpressions;
  14. namespace FCS.Crawler.ZCLotteryNews
  15. {
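  /// <summary>
  /// Scheduled job that crawls CBA news from the NetEase Sports CBA channel
  /// (http://sports.163.com/cba/) and passes each article to the news service.
  /// </summary>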
  20. public class CBA_FootBallNewsJob : CommonJob, IJob
  21. {
  /// <summary>
  /// Creates the job: instantiates the log helper and resolves the
  /// <c>IDTOpenCode</c> service from the IOC container.
  /// </summary>
  public CBA_FootBallNewsJob()
  {
      log = new LogHelper();
      services = IOC.Resolve<IDTOpenCode>();
  }
  /// <summary>
  /// Job entry point: reads the crawler configuration out of the job's data map,
  /// stores it in <c>Config</c>, then runs the crawl.
  /// </summary>
  /// <param name="context">Execution context supplied by the scheduler.</param>
  public void Execute(IJobExecutionContext context)
  {
      var jobData = context.JobDetail.JobDataMap;
      Config = CommonHelper.GetConfigFromDataMap(jobData);
      GetAll();
  }
  /// <summary>
  /// Crawls each CBA landing page and hands every article found to the news service.
  /// </summary>
  public void GetAll()
  {
      var landingPages = new List<string> { "http://sports.163.com/cba/" };
      foreach (string landingPage in landingPages)
      {
          // Load all articles linked from this landing page, then submit them one by one.
          GetOpenListFromMainUrl(landingPage)
              .ForEach(article => services.AddNews(currentNews, article));
      }
  }
  /// <summary>
  /// Downloads a NetEase landing page, collects the distinct article links found in its
  /// "topnews" section (list items and headline) and loads each linked article.
  /// </summary>
  /// <param name="mainUrl">Absolute URL of the landing page to scan.</param>
  /// <returns>
  /// The articles that were loaded with a non-empty headline. The list is empty when the
  /// page cannot be downloaded, contains no matching links, or an error occurs (the error
  /// is logged, never rethrown).
  /// </returns>
  private List<Base_News> GetOpenListFromMainUrl(string mainUrl)
  {
      var result = new List<Base_News>();
      try
      {
          // The Uri itself is not used further; constructing it rejects a malformed
          // address up front (the exception is logged by the catch block below).
          new Uri(mainUrl);

          var htmlResource = NetHelper.GetUrlResponse(mainUrl, Encoding.GetEncoding("gbk"));
          if (htmlResource == null) return result;

          var doc = new HtmlDocument();
          doc.LoadHtml(htmlResource);

          // SelectNodes returns null (not an empty collection) when nothing matches, so
          // every query is guarded on its own: a missing section must not discard the
          // links found in the other one.
          string[] linkQueries =
          {
              "//*[@class='topnews']/ul/li/a",
              "//*[@class='topnews']/h2/a"
          };
          var links = new List<string>();          // keeps document order
          var seen = new HashSet<string>();        // de-duplicates in O(1) per link
          foreach (string query in linkQueries)
          {
              HtmlNodeCollection anchors = doc.DocumentNode.SelectNodes(query);
              if (anchors == null) continue;

              foreach (HtmlNode anchor in anchors)
              {
                  // FirstOrDefault tolerates markup that repeats the href attribute.
                  HtmlAttribute href = anchor.Attributes.FirstOrDefault(a => a.Name.Equals("href"));
                  if (href == null || string.IsNullOrWhiteSpace(href.Value)) continue;

                  if (seen.Add(href.Value))
                  {
                      links.Add(href.Value);
                  }
              }
          }

          // Load every linked article; articles without a headline are treated as failed loads.
          foreach (string link in links)
          {
              Base_News article = GetNewsModel(link);
              if (!string.IsNullOrEmpty(article.FullHead))
              {
                  result.Add(article);
              }
          }
      }
      catch (Exception ex)
      {
          log.Error(GetType(),
              string.Format("【{0}】通过主抓取CBA新闻信息时发生错误,错误信息【{1}】", Config.Area + currentNews, ex.Message));
      }
      return result;
  }
  /// <summary>
  /// Downloads a single article page and extracts headline, release time, plain-text body,
  /// first inline image and category information into a <c>Base_News</c> model.
  /// </summary>
  /// <param name="url">URL of the article page.</param>
  /// <returns>
  /// A fully populated model, or an empty one (no <c>FullHead</c>) when the page cannot be
  /// downloaded, does not have the expected article structure, or an error occurs. Errors
  /// are logged, never rethrown, and never yield a partially populated model.
  /// </returns>
  private Base_News GetNewsModel(string url)
  {
      Base_News article = new Base_News();
      try
      {
          var htmlResource = NetHelper.GetUrlResponse(url, Encoding.GetEncoding("gb2312"));
          if (htmlResource == null) return article;

          var doc = new HtmlDocument();
          doc.LoadHtml(htmlResource);

          var main = doc.DocumentNode.SelectSingleNode("//*[@class='post_content_main']");
          if (main == null) return article;

          // Every piece below is required; a page missing any of them is not an article
          // in the expected layout, so it is skipped instead of failing with an exception.
          var headline = main.ChildNodes.FirstOrDefault(node => node.Name == "h1");
          var body = doc.DocumentNode.SelectSingleNode("//*[@class='post_text']");
          var timeNode = doc.DocumentNode.SelectSingleNode("//*[@class='post_time_source']");
          if (headline == null || body == null || timeNode == null || timeNode.FirstChild == null)
          {
              return article;
          }

          string bodyHtml = body.InnerHtml.Trim();
          if (bodyHtml == "") return article;

          // The first text node holds the timestamp followed by the source label; strip the label.
          string releaseTime = timeNode.FirstChild.InnerText
              .Replace("来源:", "")
              .Replace("网易体育", "")
              .Replace("\n", "");

          // First centered image inside the article body, if any (searched relative to the body node).
          var img = body.SelectSingleNode(".//*[@class='f_center']/img");
          var imgSrc = img?.Attributes.FirstOrDefault(a => a.Name.Equals("src"));

          // Category lookup. The values formatted into the SQL are internal constants
          // (table name and enum description), not external input.
          string category = currentNews.GetEnumDescription();
          var sql = string.Format(GetLotterySqlByTableName, "Base_DataItemDetail", category);
          var res = SqlHelper.ExecuteDataset(CommandType.Text, sql);

          // Populate only after everything that can fail has succeeded.
          article.ReleaseTime = releaseTime;
          if (imgSrc != null)
          {
              article.SourceAddress = imgSrc.Value;
          }
          article.Id = Guid.NewGuid().ToString();
          article.FullHead = headline.InnerText.Trim();
          article.AuthorName = "zc55128";
          article.NewsContent = NoHTML(bodyHtml);
          article.SourceName = "网易体育 CBA";
          article.TypeId = (int)NewsTypeEnum.CBA;
          if (res != null && res.Tables.Count > 0 && res.Tables[0].Rows.Count > 0)
          {
              article.CategoryId = res.Tables[0].Rows[0]["Id"].ToString();
          }
          article.Category = category;
          article.CreateDate = DateTime.Now;
          return article;
      }
      catch (Exception ex)
      {
          log.Error(GetType(),
              string.Format("【{0}】通过主抓取CBA新闻时发生错误,错误信息【{1}】", Config.Area + currentNews, ex.Message));
          // Discard whatever was filled in so far; callers skip models without a headline.
          return new Base_News();
      }
  }
  /// <summary>
  /// Builds the list of landing-page URLs to crawl: page 1 is always the fixed CBA
  /// channel address, later pages are produced by formatting the page number into
  /// <c>config.MainUrl</c>. Duplicates are dropped, order is preserved.
  /// </summary>
  /// <param name="config">Crawler configuration providing <c>MainUrl</c> and <c>MainUrlPages</c>.</param>
  /// <returns>The distinct page URLs, first page first.</returns>
  private List<string> GetMainUrl(FCSConfig config)
  {
      string pageTemplate = config.MainUrl;
      // A non-positive page count means "just the first page".
      int pageCount = Math.Max(config.MainUrlPages, 1);

      return Enumerable.Range(1, pageCount)
          .Select(page => page == 1
              ? "http://sports.163.com/cba/"
              : string.Format(pageTemplate, page))
          .Distinct()
          .ToList();
  }
  /// <summary>
  /// Matches script, iframe and frameset elements together with everything inside them.
  /// Lazy matching keeps the text that sits between two separate blocks.
  /// </summary>
  private static readonly Regex EmbeddedBlockPattern = new Regex(
      @"<(script|iframe|frameset)\b[\s\S]*?</\1\s*>",
      RegexOptions.IgnoreCase | RegexOptions.Compiled);

  /// <summary>Matches any single remaining tag (opening, closing or self-closing).</summary>
  private static readonly Regex TagPattern = new Regex(@"<[^>]*>", RegexOptions.Compiled);

  /// <summary>Matches form feed, line feed, carriage return, tab and vertical tab.</summary>
  private static readonly Regex LineControlPattern = new Regex("[\f\n\r\t\v]", RegexOptions.Compiled);

  /// <summary>
  /// Reduces an HTML fragment to single-line plain text: embedded script/iframe/frameset
  /// blocks are removed with their content, all other tags are stripped, line-control
  /// characters are dropped, the trailing source/editor footer is cut off and the
  /// source brand name is replaced.
  /// </summary>
  /// <param name="html">HTML fragment to clean; may be null or empty.</param>
  /// <returns>The plain text, or an empty string when <paramref name="html"/> is null or empty.</returns>
  public static string NoHTML(string html)
  {
      if (string.IsNullOrEmpty(html)) return string.Empty;

      // Blocks first, so that script bodies containing '>' are removed as a whole.
      string text = EmbeddedBlockPattern.Replace(html, "");

      // Stripping every tag also removes inline event handlers and javascript: links,
      // so no separate attribute-level rewriting is needed.
      text = TagPattern.Replace(text, "");
      text = LineControlPattern.Replace(text, "");

      // Cut the footer that starts with the source marker. The marker may be the very
      // first text, and the character before it belongs to the article unless it is
      // whitespace.
      int footerStart = text.IndexOf("本文来源", StringComparison.Ordinal);
      if (footerStart >= 0)
      {
          text = text.Substring(0, footerStart).TrimEnd();
      }

      return text.Replace("网易体育", "彩吧足球");
  }
  #region Initialization
  /// <summary>
  /// News type handled by this job; always <c>NewsTypeEnum.CBA</c>.
  /// </summary>
  private NewsTypeEnum currentNews
  {
      get { return NewsTypeEnum.CBA; }
  }
  #endregion Initialization
  #region SQL statements
  /// <summary>
  /// Format template for looking up the id of a news category: {0} is the table name,
  /// {1} is the category item name. Both are formatted straight into the SQL text, so
  /// only internal constants may be passed, never external input.
  /// </summary>
  /// <remarks>
  /// Read-only: the template is shared by all instances and must not be reassigned.
  /// A filter on [IsChecked] = 1 AND [IsPassed] = 1 was present as disabled text in the
  /// original and is intentionally not applied.
  /// </remarks>
  private static readonly string GetLotterySqlByTableName = @"SELECT TOP 1 [ID],[ItemId],[ItemName] FROM [dbo].[{0}] where ItemCode='NewsCategory' and [ItemName]='{1}' ";
  #endregion
  253. }
  254. }