FootballMatchJob.cs 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375
  1. using System;
  2. using System.Collections.Generic;
  3. using FCS.Common;
  4. using FCS.Crawler.Tools;
  5. using FCS.Interface;
  6. using FCS.Models;
  7. using HtmlAgilityPack;
  8. using Newtonsoft.Json;
  9. using Quartz;
  10. using System.Data;
  11. using System.Linq;
  12. using System.Text;
  13. using System.Text.RegularExpressions;
  14. using System.Configuration;
  15. using System.Reflection;
  16. namespace FCS.Crawler.ZCLotteryMatchs
  17. {
  /// <summary>
  /// Scheduled job that crawls the football (soccer) competition index and stores
  /// the areas and events it finds.
  /// </summary>
  21. public class FootballMatchJob : CommonJob, IJob
  22. {
  // Areas saved by GetCOntryMatch during this run; GetAreaMatch looks them up by name.
  private readonly List<F_Area> f_Areas = new List<F_Area>();

  /// <summary>
  /// Creates the job, setting up its logger and resolving the data service from the IOC container.
  /// </summary>
  public FootballMatchJob()
  {
      log = new LogHelper();

      var dataService = IOC.Resolve<IDTOpenCode>();
      services = dataService;
  }
  /// <summary>
  /// Scheduler entry point: reads the job configuration from the job's data map,
  /// stores it in Config and then runs the crawl.
  /// </summary>
  /// <param name="context">Execution context supplied by the scheduler.</param>
  public void Execute(IJobExecutionContext context)
  {
      var jobData = context.JobDetail.JobDataMap;
      Config = CommonHelper.GetConfigFromDataMap(jobData);

      GetAll();
  }
  /// <summary>
  /// Runs the crawl for the soccer index page, wrapped in CommonHelper.LogBD.
  /// </summary>
  public void GetAll()
  {
      CommonHelper.LogBD(typeof(FootballMatchJob), () =>
      {
          var indexPages = new List<string> { "http://saishi.zgzcw.com/soccer/" };

          for (int i = 0; i < indexPages.Count; i++)
          {
              string pageUrl = indexPages[i];

              // Regions and countries go first: this pass fills f_Areas,
              // which the continental pass reads to resolve area ids.
              GetCOntryMatch(pageUrl);

              // Then the continental competitions of each region.
              GetAreaMatch(pageUrl);
          }
      });
  }
  /// <summary>
  /// Crawls the continental competitions (Champions League and the like) listed in the
  /// "mbcon mbr fl" blocks of the page and saves each one as an event.
  /// </summary>
  /// <remarks>
  /// A block without a heading is skipped. A link without a logo or without an href is
  /// still saved, with an empty string in the missing field. When a heading maps to a
  /// match type whose data item cannot be found, an error is logged and EventType is
  /// left unassigned.
  /// </remarks>
  /// <param name="mainUrl">Address of the listing page to download.</param>
  private void GetAreaMatch(string mainUrl)
  {
      var htmlResource = NetHelper.GetUrlResponse(mainUrl, Encoding.GetEncoding("utf-8"));
      if (string.IsNullOrEmpty(htmlResource)) return;

      var doc = new HtmlDocument();
      doc.LoadHtml(htmlResource);

      // Match-type data items; their ids become the EventType of the saved events.
      var enumData = services.GetDataItem(DataItemDetailEnum.FootBallMatchType);

      HtmlNodeCollection areaHtml = doc.DocumentNode.SelectNodes("//*[@class='mbcon mbr fl']");
      if (areaHtml == null) return;

      // Page heading -> name of the match-type data item. The page heading "澳洲赛事"
      // corresponds to the data item named "大洋洲赛事".
      var eventTypeNames = new Dictionary<string, string>
      {
          { "欧洲赛事", "欧洲赛事" },
          { "亚洲赛事", "亚洲赛事" },
          { "美洲赛事", "美洲赛事" },
          { "非洲赛事", "非洲赛事" },
          { "澳洲赛事", "大洋洲赛事" }
      };

      foreach (var areaItem in areaHtml)
      {
          // The world-ranking block shares the CSS class but holds no competitions.
          if (string.IsNullOrEmpty(areaItem.InnerText) || areaItem.InnerText.Contains("世界排名"))
          {
              continue;
          }

          // Re-parse the block on its own so the "//" queries below cannot match other blocks.
          var areaDoc = new HtmlDocument();
          areaDoc.LoadHtml(areaItem.InnerHtml);

          // SelectSingleNode returns null when nothing matches; skip blocks without a heading.
          var heading = areaDoc.DocumentNode.SelectSingleNode("//span");
          if (heading == null) continue;
          var area = heading.InnerText.Trim();

          HtmlNodeCollection matchHtml = areaDoc.DocumentNode.SelectNodes("//div/a");
          if (matchHtml == null) continue;

          // Both values depend only on the heading, so resolve them once per block.
          var knownArea = f_Areas.FirstOrDefault(o => o.Name == area);
          string areaId = knownArea != null ? knownArea.Id : "";

          string eventType = null;
          string typeName;
          if (eventTypeNames.TryGetValue(area, out typeName))
          {
              var typeIds = enumData.Where(o => o.Name == typeName).Select(o => o.Id).ToList();
              if (typeIds.Count > 0)
              {
                  eventType = typeIds[0].ToString();
              }
              else
              {
                  log.Error(GetType(), string.Format("未找到名称为【{0}】的赛事类型数据项,赛事类型留空", typeName));
              }
          }

          foreach (var matchitem in matchHtml)
          {
              var linkDoc = new HtmlDocument();
              linkDoc.LoadHtml(matchitem.InnerHtml);
              var logoNode = linkDoc.DocumentNode.SelectSingleNode("//div/img");

              var events = new F_Events
              {
                  Id = Guid.NewGuid().ToString(),
                  Name = matchitem.InnerText.Trim(),
                  AreaId = areaId,
                  LogoImage = logoNode?.Attributes["src"]?.Value ?? "",
                  // The competition's link is kept in Remark.
                  Remark = matchitem.Attributes["href"]?.Value ?? ""
              };

              // Headings outside the map keep whatever default F_Events gives EventType.
              if (eventType != null)
              {
                  events.EventType = eventType;
              }

              services.AddEvents(FCSLottery.F_Events, events);
          }
      }
  }
  /// <summary>
  /// Crawls the "mbcon mbl fl" blocks of the page: saves every region as an area, every
  /// country under it as a child area, and every competition of a country as an event.
  /// </summary>
  /// <remarks>
  /// Entries under the "杯赛赛事" heading are saved directly as events of type "国际赛事"
  /// instead of as child areas. A missing logo or link is stored as an empty string.
  /// When a match-type data item cannot be found, an error is logged and EventType is
  /// left unassigned. Any exception is logged and ends the crawl of this page; it is
  /// not rethrown.
  /// </remarks>
  /// <param name="mainUrl">Address of the listing page to download.</param>
  private void GetCOntryMatch(string mainUrl)
  {
      try
      {
          var doc = CommonHelper.GetHtmlHtmlDocument(new Models.DTO.HtmlParameterDTO { Url = mainUrl, Timeout = 10 * 1000 });

          // The configured "Termination" marker means there is no page to process.
          if (doc.DocumentNode.InnerHtml == ConfigurationManager.AppSettings["Termination"]) return;

          // Match-type data items; their ids become the EventType of the saved events.
          var enumData = services.GetDataItem(DataItemDetailEnum.FootBallMatchType);

          HtmlNodeCollection areaHtml = doc.DocumentNode.SelectNodes("//*[@class='mbcon mbl fl']");
          if (areaHtml == null) return;

          // Region heading -> name of the match-type data item. The page heading "澳洲赛事"
          // corresponds to the data item named "大洋洲赛事"; cup entries use "国际赛事".
          var eventTypeNames = new Dictionary<string, string>
          {
              { "欧洲赛事", "欧洲赛事" },
              { "亚洲赛事", "亚洲赛事" },
              { "美洲赛事", "美洲赛事" },
              { "非洲赛事", "非洲赛事" },
              { "澳洲赛事", "大洋洲赛事" },
              { "杯赛赛事", "国际赛事" }
          };

          foreach (var areaItem in areaHtml)
          {
              if (string.IsNullOrEmpty(areaItem.InnerText)) continue;

              // Re-parse the block on its own so the "//" queries below cannot match other blocks.
              var areaDoc = new HtmlDocument();
              areaDoc.LoadHtml(areaItem.InnerHtml);

              var heading = areaDoc.DocumentNode.SelectSingleNode("//span");
              if (heading == null) continue;
              var area = heading.InnerText.Trim();

              // Save the region and remember it so GetAreaMatch can resolve its id by name.
              var areaData = new F_Area
              {
                  Id = Guid.NewGuid().ToString(),
                  Name = area
              };
              services.AddArea(FCSLottery.F_Area, areaData);
              f_Areas.Add(areaData);

              HtmlNodeCollection countryHtml = areaDoc.DocumentNode.SelectNodes("//*[@class='ls']/div");
              if (countryHtml == null) continue;

              bool isCupArea = area == "杯赛赛事";

              // The match type depends only on the region, so resolve it once per region.
              string eventType = null;
              string typeName;
              if (eventTypeNames.TryGetValue(area, out typeName))
              {
                  var typeIds = enumData.Where(o => o.Name == typeName).Select(o => o.Id).ToList();
                  if (typeIds.Count > 0)
                  {
                      eventType = typeIds[0].ToString();
                  }
                  else
                  {
                      log.Error(GetType(), string.Format("未找到名称为【{0}】的赛事类型数据项,赛事类型留空", typeName));
                  }
              }

              foreach (var countryItem in countryHtml)
              {
                  // Check before parsing: there is nothing to load from an empty entry.
                  if (string.IsNullOrEmpty(countryItem.InnerHtml)) continue;

                  var countryDoc = new HtmlDocument();
                  countryDoc.LoadHtml(countryItem.InnerHtml);

                  var nameNode = countryDoc.DocumentNode.SelectSingleNode("//div");
                  if (nameNode == null) continue;
                  var country = nameNode.InnerText.Trim();

                  if (isCupArea)
                  {
                      // Under the cup heading each entry is itself a competition.
                      var cupLink = countryDoc.DocumentNode.SelectSingleNode("//a");
                      var cupLogo = countryDoc.DocumentNode.SelectSingleNode("//a/img");

                      var cupEvent = new F_Events();
                      cupEvent.Id = Guid.NewGuid().ToString();
                      cupEvent.Name = country;
                      cupEvent.AreaId = areaData.Id;
                      if (eventType != null)
                      {
                          cupEvent.EventType = eventType;
                      }
                      cupEvent.Remark = cupLink?.Attributes["href"]?.Value ?? "";
                      cupEvent.LogoImage = cupLogo?.Attributes["src"]?.Value ?? "";
                      services.AddEvents(FCSLottery.F_Events, cupEvent);
                      continue;
                  }

                  // Save the country as a child of the region. A country without a logo is
                  // still saved rather than aborting the whole crawl.
                  var countryLogo = countryDoc.DocumentNode.SelectSingleNode(".//img");
                  var countryData = new F_Area
                  {
                      Id = Guid.NewGuid().ToString(),
                      Name = country,
                      ParentId = areaData.Id,
                      LogoImage = countryLogo?.Attributes["src"]?.Value ?? ""
                  };
                  services.AddArea(FCSLottery.F_Area, countryData);

                  HtmlNodeCollection matchHtml = countryDoc.DocumentNode.SelectNodes("//*[@class='kuang']/a");
                  if (matchHtml == null) continue;

                  foreach (var matchItem in matchHtml)
                  {
                      if (string.IsNullOrEmpty(matchItem.InnerHtml)) continue;

                      var linkDoc = new HtmlDocument();
                      linkDoc.LoadHtml(matchItem.InnerHtml);
                      var logoNode = linkDoc.DocumentNode.SelectSingleNode("//div/img");

                      // Competition name, link and logo.
                      var matchEvent = new F_Events();
                      matchEvent.Id = Guid.NewGuid().ToString();
                      matchEvent.Name = matchItem.InnerText.Trim();
                      matchEvent.AreaId = countryData.Id;
                      matchEvent.Remark = matchItem.Attributes["href"]?.Value ?? "";
                      matchEvent.LogoImage = logoNode?.Attributes["src"]?.Value ?? "";
                      // Regions outside the map keep whatever default F_Events gives EventType.
                      if (eventType != null)
                      {
                          matchEvent.EventType = eventType;
                      }
                      services.AddEvents(FCSLottery.F_Events, matchEvent);
                  }
              }
          }
      }
      catch (Exception ex)
      {
          // The page address identifies the failing crawl. The previous text labelled the
          // failure as basketball analysis and dereferenced Config, which is only set by Execute.
          log.Error(GetType(),
              string.Format("【{0}】抓取足球赛事时发生错误,错误信息【{1}】", mainUrl, ex.Message));
      }
  }
  /// <summary>
  /// Builds the list of listing-page addresses to crawl.
  /// </summary>
  /// <param name="config">
  /// Supplies MainUrl, used as a format string that receives the page number, and
  /// MainUrlPages, the number of pages (values below 1 are treated as 1).
  /// </param>
  /// <returns>
  /// The distinct page addresses in page order; page 1 is always the fixed soccer index address.
  /// </returns>
  private List<string> GetMainUrl(FCSConfig config)
  {
      string pageTemplate = config.MainUrl;
      int pageCount = Math.Max(config.MainUrlPages, 1);

      var alreadyListed = new HashSet<string>();
      var addresses = new List<string>();

      int page = 1;
      while (page <= pageCount)
      {
          string address = page == 1
              ? "http://saishi.zgzcw.com/soccer"
              : string.Format(pageTemplate, page);

          // Add returns false for an address that is already listed.
          if (alreadyListed.Add(address))
          {
              addresses.Add(address);
          }

          page++;
      }

      return addresses;
  }
  // Patterns used by NoHTML, built once instead of on every call.

  // A whole <script>…</script> element including its body. The quantifier is lazy so
  // that text between two separate script elements is kept.
  private static readonly Regex ScriptElementRegex =
      new Regex(@"<script[\s\S]+?</script *>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

  // A whole <iframe>…</iframe> element including its body.
  private static readonly Regex IframeElementRegex =
      new Regex(@"<iframe[\s\S]+?</iframe *>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

  // A whole <frameset>…</frameset> element including its body.
  private static readonly Regex FramesetElementRegex =
      new Regex(@"<frameset[\s\S]+?</frameset *>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

  // NOTE(review): the next two patterns are kept exactly as they were. They look like
  // corrupted forms of "[\s\S]" and " on…=" (script hrefs and on… event attributes), but
  // the greedy "[\s\S]*" versions would delete body text between matches, and every tag
  // is removed later by AnyTagRegex anyway. Confirm the intent before changing them.
  private static readonly Regex ScriptHrefRegex =
      new Regex(@" href *= *[sS]*script *:", RegexOptions.IgnoreCase | RegexOptions.Compiled);

  private static readonly Regex EventAttributeRegex =
      new Regex(@" no[sS]*=", RegexOptions.IgnoreCase | RegexOptions.Compiled);

  // An <img …> tag.
  private static readonly Regex ImgTagRegex =
      new Regex(@"<img[^>]+>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

  // Any remaining tag.
  private static readonly Regex AnyTagRegex =
      new Regex(@"<[^>]*>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

  /// <summary>
  /// Reduces an HTML fragment to plain text: removes script, iframe and frameset elements
  /// together with their content, removes every remaining tag, deletes form-feed, line-feed,
  /// carriage-return, tab and vertical-tab characters, cuts the text just before the
  /// "本文来源" source credit and replaces "网易体育" with "彩吧足球".
  /// </summary>
  /// <param name="html">The HTML to clean; may be null or empty.</param>
  /// <returns>The cleaned text, or an empty string when <paramref name="html"/> is null or empty.</returns>
  public static string NoHTML(string html)
  {
      if (string.IsNullOrEmpty(html))
      {
          return string.Empty;
      }

      html = ScriptElementRegex.Replace(html, "");
      html = ScriptHrefRegex.Replace(html, "");
      html = EventAttributeRegex.Replace(html, " _disibledevent=");
      html = IframeElementRegex.Replace(html, "");
      html = FramesetElementRegex.Replace(html, "");
      html = ImgTagRegex.Replace(html, "");
      html = AnyTagRegex.Replace(html, "");
      html = Regex.Replace(html, "[\f\n\r\t\v]", "");

      // Drop the source credit and everything after it, plus the one character in front of
      // the marker (kept from the existing behaviour; presumably an opening bracket).
      // Clamp at 0 so a text that starts with the marker no longer throws.
      int index = html.IndexOf("本文来源", StringComparison.Ordinal);
      if (index != -1)
      {
          html = html.Substring(0, Math.Max(index - 1, 0));
      }

      // Strings are immutable: the replaced text has to be returned, not discarded.
      return html.Replace("网易体育", "彩吧足球");
  }
  #region 初始化信息
  /// <summary>
  /// News category reported by this job; it always evaluates to NewsTypeEnum.篮球分析.
  /// NOTE(review): that member's name means "basketball analysis", which looks copied
  /// from another job. Confirm the intended category for the football job.
  /// </summary>
  private NewsTypeEnum currentNews
  {
      get { return NewsTypeEnum.篮球分析; }
  }
  #endregion 初始化信息
  #region SQL语句
  /// <summary>
  /// SQL template that selects the first row of a table whose ItemName matches.
  /// Placeholder {0} is the table name and {1} is the item name.
  /// NOTE(review): both values are spliced into the SQL text, so this must never be
  /// formatted with untrusted input; a parameterised query would be safer.
  /// </summary>
  private static string GetLotterySqlByTableName =
      @"SELECT TOP 1 [ID],[ItemId],[ItemName] FROM [dbo].[{0}] where [ItemName]='{1}' ";
  // Filter that was left disabled: WHERE [IsChecked] = 1 AND [IsPassed] = 1
  #endregion
  340. }
  341. }