// FootballMatchJob.cs
  1. using System;
  2. using System.Collections.Generic;
  3. using FCS.Common;
  4. using FCS.Crawler.Tools;
  5. using FCS.Interface;
  6. using FCS.Models;
  7. using HtmlAgilityPack;
  8. using Newtonsoft.Json;
  9. using Quartz;
  10. using System.Data;
  11. using System.Linq;
  12. using System.Text;
  13. using System.Text.RegularExpressions;
  14. namespace FCS.Crawler.ZCLotteryMatchs
  15. {
  /// <summary>
  /// Quartz job that crawls football (soccer) competitions.
  /// </summary>
  19. public class FootballMatchJob : CommonJob, IJob
  20. {
  /// <summary>
  /// Regions inserted by <c>GetCOntryMatch</c>; <c>GetAreaMatch</c> looks region ids up here by name.
  /// </summary>
  /// <remarks>
  /// NOTE(review): the list is static and nothing in this class clears it, so entries from earlier
  /// runs stay in it and a lookup by name returns the oldest match - confirm this is intended.
  /// </remarks>
  private static readonly List<F_Area> f_Areas = new List<F_Area>();
  /// <summary>
  /// Creates the job: assigns a new <c>LogHelper</c> to <c>log</c> and the <c>IDTOpenCode</c>
  /// implementation resolved through <c>IOC</c> to <c>services</c>.
  /// </summary>
  public FootballMatchJob()
      : base()
  {
      var logger = new LogHelper();
      log = logger;

      var resolvedServices = IOC.Resolve<IDTOpenCode>();
      services = resolvedServices;
  }
  /// <summary>
  /// Quartz entry point: reads the job configuration from the job's data map, stores it in
  /// <c>Config</c>, then runs the full crawl.
  /// </summary>
  /// <param name="context">Execution context supplied by the scheduler.</param>
  public void Execute(IJobExecutionContext context)
  {
      var dataMap = context.JobDetail.JobDataMap;
      Config = CommonHelper.GetConfigFromDataMap(dataMap);

      GetAll();
  }
  /// <summary>
  /// Runs the crawl against every configured entry page.
  /// </summary>
  /// <remarks>
  /// For each page the per-country competitions are crawled first and the continental
  /// competitions second.
  /// </remarks>
  public void GetAll()
  {
      string[] entryPages = { "http://saishi.zgzcw.com/soccer/" };

      for (int i = 0; i < entryPages.Length; i++)
      {
          string entryPage = entryPages[i];

          // Competitions grouped by region and country.
          GetCOntryMatch(entryPage);

          // Continental competitions of each region.
          GetAreaMatch(entryPage);
      }
  }
  /// <summary>
  /// Crawls the continental competitions (Champions League and the like) listed per region on
  /// the index page and stores each one through <c>services.AddEvents</c>.
  /// </summary>
  /// <remarks>
  /// Region ids are looked up by name in the static <c>f_Areas</c> list; a region that is not in
  /// the list yields an empty <c>AreaId</c>. A missing region heading skips that region, while a
  /// missing logo or link is stored as an empty string and a missing data item leaves
  /// <c>EventType</c> unset, instead of throwing. Any other failure is logged and swallowed,
  /// matching <c>GetCOntryMatch</c>.
  /// </remarks>
  /// <param name="mainUrl">Address of the index page to download and parse.</param>
  private void GetAreaMatch(string mainUrl)
  {
      try
      {
          var htmlResource = NetHelper.GetUrlResponse(mainUrl, Encoding.GetEncoding("utf-8"));
          if (htmlResource == null)
          {
              return;
          }

          HtmlDocument doc = new HtmlDocument();
          doc.LoadHtml(htmlResource);

          // Data items describing the football match types.
          var enumData = services.GetDataItem(DataItemDetailEnum.FootBallMatchType);

          // Region heading on the page -> name of the data item that classifies its events.
          var eventTypeNames = new Dictionary<string, string>
          {
              { "欧洲赛事", "欧洲赛事" },
              { "亚洲赛事", "亚洲赛事" },
              { "美洲赛事", "美洲赛事" },
              { "非洲赛事", "非洲赛事" },
              { "澳洲赛事", "大洋洲赛事" },
          };

          // Region containers.
          HtmlNodeCollection areaHtml = doc.DocumentNode.SelectNodes("//*[@class='mbcon mbr fl']");
          if (areaHtml == null)
          {
              return;
          }

          foreach (var areaItem in areaHtml)
          {
              if (string.IsNullOrEmpty(areaItem.InnerText) || areaItem.InnerText.Contains("世界排名"))
              {
                  continue;
              }

              // The first span inside the container carries the region name.
              var areaNode = areaItem.SelectSingleNode(".//span");
              if (areaNode == null)
              {
                  continue;
              }
              var area = areaNode.InnerText.Trim();

              HtmlNodeCollection matchHtml = areaItem.SelectNodes(".//div/a");
              if (matchHtml == null)
              {
                  continue;
              }

              // Both values depend only on the region, so resolve them once per region.
              var knownArea = f_Areas.FirstOrDefault(o => o.Name == area);
              string areaId = knownArea != null ? knownArea.Id : "";

              string eventType = null;
              string eventTypeName;
              if (eventTypeNames.TryGetValue(area, out eventTypeName))
              {
                  eventType = enumData
                      .Where(o => o.Name == eventTypeName)
                      .Select(o => o.Id.ToString())
                      .FirstOrDefault();
              }

              foreach (var matchItem in matchHtml)
              {
                  var logoNode = matchItem.SelectSingleNode(".//div/img");

                  F_Events events = new F_Events();
                  events.Id = Guid.NewGuid().ToString();
                  events.Name = matchItem.InnerText.Trim();
                  events.AreaId = areaId;
                  events.LogoImage = logoNode != null ? logoNode.GetAttributeValue("src", "") : "";
                  events.EventType = eventType;
                  events.Remark = matchItem.GetAttributeValue("href", "");

                  services.AddEvents(FCSLottery.F_Events, events);
              }
          }
      }
      catch (Exception ex)
      {
          log.Error(GetType(),
              string.Format("【{0}】抓取洲际赛事时发生错误,错误信息【{1}】", Config.Area + currentNews, ex.Message));
      }
  }
  /// <summary>
  /// Crawls the region / country / competition tree of the index page.
  /// </summary>
  /// <remarks>
  /// Every region is stored through <c>services.AddArea</c> and remembered in <c>f_Areas</c>.
  /// Inside the cup region ("杯赛赛事") each entry is stored directly as an event of type
  /// "国际赛事"; in every other region each entry is stored as a child area and its
  /// competitions are stored as events. A missing logo or link is stored as an empty string and
  /// a missing data item leaves <c>EventType</c> unset. Any exception is logged and swallowed.
  /// </remarks>
  /// <param name="mainUrl">Address of the index page to download and parse.</param>
  private void GetCOntryMatch(string mainUrl)
  {
      try
      {
          var htmlResource = NetHelper.GetUrlResponse(mainUrl, Encoding.GetEncoding("utf-8"));
          if (htmlResource == null)
          {
              return;
          }

          HtmlDocument doc = new HtmlDocument();
          doc.LoadHtml(htmlResource);

          // Data items describing the football match types.
          var enumData = services.GetDataItem(DataItemDetailEnum.FootBallMatchType);

          // Resolves a data-item name to its id text; null when no such item exists.
          Func<string, string> findEventType = name => enumData
              .Where(o => o.Name == name)
              .Select(o => o.Id.ToString())
              .FirstOrDefault();

          // Region heading on the page -> name of the data item that classifies its events.
          var eventTypeNames = new Dictionary<string, string>
          {
              { "欧洲赛事", "欧洲赛事" },
              { "亚洲赛事", "亚洲赛事" },
              { "美洲赛事", "美洲赛事" },
              { "非洲赛事", "非洲赛事" },
              { "澳洲赛事", "大洋洲赛事" },
          };

          // Region containers.
          HtmlNodeCollection areaHtml = doc.DocumentNode.SelectNodes("//*[@class='mbcon mbl fl']");
          if (areaHtml == null)
          {
              return;
          }

          foreach (var areaItem in areaHtml)
          {
              if (string.IsNullOrEmpty(areaItem.InnerText))
              {
                  continue;
              }

              // The first span inside the container carries the region name.
              var areaNode = areaItem.SelectSingleNode(".//span");
              if (areaNode == null)
              {
                  continue;
              }

              // Store the region and remember it for later lookups by name.
              F_Area areaData = new F_Area();
              areaData.Id = Guid.NewGuid().ToString();
              areaData.Name = areaNode.InnerText.Trim();
              services.AddArea(FCSLottery.F_Area, areaData);
              f_Areas.Add(areaData);

              // The event type depends only on the region, so resolve it once per region.
              bool isCupArea = areaData.Name == "杯赛赛事";
              string eventType = null;
              string eventTypeName;
              if (isCupArea)
              {
                  eventType = findEventType("国际赛事");
              }
              else if (eventTypeNames.TryGetValue(areaData.Name, out eventTypeName))
              {
                  eventType = findEventType(eventTypeName);
              }

              // Country containers of this region.
              HtmlNodeCollection countryHtml = areaItem.SelectNodes(".//*[@class='ls']/div");
              if (countryHtml == null)
              {
                  continue;
              }

              foreach (var countryItem in countryHtml)
              {
                  if (string.IsNullOrEmpty(countryItem.InnerHtml))
                  {
                      continue;
                  }

                  // The first nested div carries the country (or cup) name.
                  var countryNode = countryItem.SelectSingleNode(".//div");
                  if (countryNode == null)
                  {
                      continue;
                  }
                  var country = countryNode.InnerText.Trim();

                  if (isCupArea)
                  {
                      // Cup entries are competitions themselves, not countries.
                      var linkNode = countryItem.SelectSingleNode(".//a");
                      var cupLogoNode = countryItem.SelectSingleNode(".//a/img");

                      F_Events cupEvent = new F_Events();
                      cupEvent.Id = Guid.NewGuid().ToString();
                      cupEvent.Name = country;
                      cupEvent.AreaId = areaData.Id;
                      cupEvent.EventType = eventType;
                      cupEvent.Remark = linkNode != null ? linkNode.GetAttributeValue("href", "") : "";
                      cupEvent.LogoImage = cupLogoNode != null ? cupLogoNode.GetAttributeValue("src", "") : "";
                      services.AddEvents(FCSLottery.F_Events, cupEvent);
                      continue;
                  }

                  // Store the country as a child area of the region.
                  F_Area countryData = new F_Area();
                  countryData.Id = Guid.NewGuid().ToString();
                  countryData.Name = country;
                  countryData.ParentId = areaData.Id;
                  services.AddArea(FCSLottery.F_Area, countryData);

                  // Competitions of this country.
                  HtmlNodeCollection matchHtml = countryItem.SelectNodes(".//*[@class='kuang']/a");
                  if (matchHtml == null)
                  {
                      continue;
                  }

                  foreach (var matchItem in matchHtml)
                  {
                      if (string.IsNullOrEmpty(matchItem.InnerHtml))
                      {
                          continue;
                      }

                      var logoNode = matchItem.SelectSingleNode(".//div/img");

                      // Competition name, link and logo.
                      F_Events events = new F_Events();
                      events.Id = Guid.NewGuid().ToString();
                      events.Name = matchItem.InnerText.Trim();
                      events.AreaId = countryData.Id;
                      events.Remark = matchItem.GetAttributeValue("href", "");
                      events.LogoImage = logoNode != null ? logoNode.GetAttributeValue("src", "") : "";
                      events.EventType = eventType;
                      services.AddEvents(FCSLottery.F_Events, events);
                  }
              }
          }
      }
      catch (Exception ex)
      {
          // NOTE(review): the message text and currentNews both refer to basketball analysis
          // although this job crawls football - confirm and correct together.
          log.Error(GetType(),
              string.Format("【{0}】通过主抓取篮球分析时发生错误,错误信息【{1}】", Config.Area + currentNews, ex.Message));
      }
  }
  /// <summary>
  /// Builds the list of index-page addresses to crawl.
  /// </summary>
  /// <remarks>
  /// Page 1 is always the fixed soccer index address. Pages 2 to <c>config.MainUrlPages</c> are
  /// produced by formatting <c>config.MainUrl</c> with the page number; they are skipped when the
  /// template is null or empty, so no empty address is returned. Duplicates are dropped.
  /// </remarks>
  /// <param name="config">Crawler configuration providing <c>MainUrl</c> and <c>MainUrlPages</c>.</param>
  /// <returns>The distinct page addresses in page order.</returns>
  /// <exception cref="ArgumentNullException"><paramref name="config"/> is null.</exception>
  private List<string> GetMainUrl(FCSConfig config)
  {
      if (config == null)
      {
          throw new ArgumentNullException("config");
      }

      List<string> urlList = new List<string> { "http://saishi.zgzcw.com/soccer" };

      string template = config.MainUrl;
      if (string.IsNullOrEmpty(template))
      {
          return urlList;
      }

      for (int page = 2; page <= config.MainUrlPages; page++)
      {
          string pageUrl = string.Format(template, page);
          if (!urlList.Contains(pageUrl))
          {
              urlList.Add(pageUrl);
          }
      }

      return urlList;
  }
  /// <summary>
  /// Reduces an HTML fragment to plain text.
  /// </summary>
  /// <remarks>
  /// Steps, in order: script, iframe and frameset elements are removed together with their
  /// content; every remaining tag is removed; form feeds, line breaks, tabs and vertical tabs
  /// are removed; the text is cut just before the character preceding "本文来源"; and
  /// "网易体育" is replaced by "彩吧足球".
  /// </remarks>
  /// <param name="html">HTML to clean; may be null or empty.</param>
  /// <returns>The cleaned text, or an empty string for null or empty input.</returns>
  public static string NoHTML(string html)
  {
      if (string.IsNullOrEmpty(html))
      {
          return string.Empty;
      }

      // [\s\S] matches any character including line breaks; the lazy quantifier stops at the
      // first closing tag so text between two separate blocks is kept.
      html = Regex.Replace(html, @"<script\b[\s\S]*?</script\s*>", "", RegexOptions.IgnoreCase);
      html = Regex.Replace(html, @"<iframe\b[\s\S]*?</iframe\s*>", "", RegexOptions.IgnoreCase);
      html = Regex.Replace(html, @"<frameset\b[\s\S]*?</frameset\s*>", "", RegexOptions.IgnoreCase);

      // Every remaining tag goes, attributes included, so no attribute-level filtering is needed.
      html = Regex.Replace(html, @"<[^>]*>", "");

      // Drop form feed, line feed, carriage return, tab and vertical tab.
      html = Regex.Replace(html, @"[\f\n\r\t\v]", "");

      // Cut the source / editor credit. The character just before the marker is dropped as
      // well; the length is clamped so a marker at position 0 cannot produce a negative length.
      int index = html.IndexOf("本文来源", StringComparison.Ordinal);
      if (index != -1)
      {
          html = html.Substring(0, Math.Max(index - 1, 0));
      }

      // string.Replace returns a new string, so its result has to be the one returned.
      return html.Replace("网易体育", "彩吧足球");
  }
  #region 初始化信息
  /// <summary>
  /// News category appended to the area name in this job's error log messages.
  /// </summary>
  /// <remarks>
  /// NOTE(review): the value is the basketball-analysis member although this job crawls
  /// football - confirm whether a football member should be used instead.
  /// </remarks>
  private NewsTypeEnum currentNews
  {
      get { return NewsTypeEnum.篮球分析; }
  }
  #endregion 初始化信息
  #region SQL语句
  /// <summary>
  /// Format template of a query returning the first row of a table whose <c>ItemName</c>
  /// matches: placeholder {0} is the table name, placeholder {1} the item name.
  /// </summary>
  /// <remarks>
  /// Both placeholders are spliced into the SQL text. Only format this template with trusted
  /// values; pass the item name as a query parameter if it can come from crawled or user data.
  /// </remarks>
  private static readonly string GetLotterySqlByTableName = @"SELECT TOP 1 [ID],[ItemId],[ItemName] FROM [dbo].[{0}] where [ItemName]='{1}' "; // disabled filter: WHERE [IsChecked] = 1 AND [IsPassed] = 1
  #endregion
  331. }
  332. }