GroupingJob.cs 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373
  1. using System;
  2. using System.Collections.Generic;
  3. using System.Linq;
  4. using System.Text;
  5. using FCS.Common;
  6. using FCS.Interface;
  7. using FCS.Models;
  8. using HtmlAgilityPack;
  9. using Quartz;
  10. using System.Threading.Tasks;
  11. using System.Diagnostics;
  12. using System.Threading;
  13. using System.Web;
  14. using System.IO;
  15. using Newtonsoft.Json;
  16. using System.Net;
  17. using FCS.Crawler.ZCLotteryIP;
  18. using System.Configuration;
  19. using FCS.Models.DTO;
  20. namespace FCS.Crawler.ZCLotteryGrouping
  21. {
  22. public class GroupingJob : CommonJob
  23. {
  /// <summary>
  /// Event records that both crawl entry points iterate over.
  /// </summary>
  private List<F_Events> eventList;

  /// <summary>
  /// Creates the job and loads every <c>F_Events</c> row through the inherited
  /// <c>services</c> accessor, keeping the rows in memory.
  /// </summary>
  public GroupingJob()
  {
      var allEvents = services.Query<F_Events>();
      this.eventList = allEvents.ToList();
  }
  /// <summary>
  /// Incremental grouping update: crawls the current-year seasons of every event and
  /// bulk-inserts only the groups (parents and children) that are not stored yet.
  /// </summary>
  /// <param name="mainUrl">Site root that each event's <c>Remark</c> path is appended to.</param>
  /// <param name="isUpdate">Not read by this method; kept so existing callers still compile.</param>
  public void Click(string mainUrl = "http://saishi.zgzcw.com", bool isUpdate = false)
  {
      var result = new List<F_Grouping>();

      // Events whose name contains "球会友谊" (club friendlies) are excluded.
      // The filtered list replaces the field, exactly as before.
      eventList = eventList.Where(p => !p.Name.Contains("球会友谊")).ToList();

      eventList.ForEach(p =>
      {
          // Without a relative path the request would only hit the site root.
          if (string.IsNullOrEmpty(p.Remark))
              return;

          // taskList is only touched from this thread; nested work is awaited by its parent task.
          taskList.Add(Task.Run(() => CrawlEventSeasons(result, mainUrl, p)));
      });

      // Each event task waits for its own season/stage tasks, so waiting on the
      // top-level tasks covers all work. No fixed sleep is needed to let nested tasks register.
      Task.WaitAll(taskList.ToArray());

      SaveNewGroupings(result);
  }

  /// <summary>
  /// Loads an event's main page, collects its current-year seasons and crawls each season in parallel.
  /// </summary>
  private void CrawlEventSeasons(List<F_Grouping> result, string mainUrl, F_Events evt)
  {
      var doc = CommonHelper.GetHtmlHtmlDocument(new HtmlParameterDTO { Url = mainUrl + evt.Remark, Title = "足彩" });
      if (doc == null)
          return;

      var seasonParentData = doc.DocumentNode.SelectSingleNode("//div[@class='div-select hoverSelect']");
      if (seasonParentData == null || doc.DocumentNode.InnerHtml == ConfigurationManager.AppSettings["Termination"])
          return;

      // SelectNodes returns null (not an empty collection) when nothing matches.
      var seasonData = seasonParentData.SelectNodes(".//a");
      if (seasonData == null)
          return;

      var year = DateTime.Now.Year;
      var thisYear = year.ToString();
      var lastYear = (year - 1).ToString();

      // Season name -> season URL.
      var seasons = new Dictionary<string, string>();
      foreach (HtmlNode anchor in seasonData)
      {
          var li = anchor.SelectNodes(".//li");
          if (li == null || li.Count <= 0)
              continue;

          var href = anchor.Attributes["href"];
          if (href == null || string.IsNullOrEmpty(href.Value))
              continue;

          // Keep seasons that mention the current year but not the previous one.
          var seasonName = li[0].InnerText;
          if (seasonName.Contains(thisYear) && !seasonName.Contains(lastYear) && !seasons.ContainsKey(seasonName))
              seasons.Add(seasonName, href.Value);
      }

      var seasonTasks = new List<Task>();
      foreach (var season in seasons)
      {
          var seasonName = season.Key;
          var seasonUrl = season.Value;
          seasonTasks.Add(Task.Run(() => CrawlSeasonStages(result, evt.Id, seasonName, seasonUrl)));
      }
      Task.WaitAll(seasonTasks.ToArray());
  }

  /// <summary>
  /// Reads the first-level groups (stage tabs) of one season and crawls the children of each stage in parallel.
  /// </summary>
  private void CrawlSeasonStages(List<F_Grouping> result, string eventId, string season, string seasonUrl)
  {
      // Each task owns its document; sharing one variable between tasks mixes up pages.
      var doc = CommonHelper.GetHtmlHtmlDocument(new HtmlParameterDTO { Url = seasonUrl, Title = "足彩", Timeout = 10 * 1000, NotIpNumber = 200 });
      if (doc == null)
          return;

      var leagueRight = doc.DocumentNode.SelectSingleNode("//div[@class='league_right']");
      if (leagueRight == null || doc.DocumentNode.InnerHtml == ConfigurationManager.AppSettings["Termination"])
          return;

      // The stage tabs live in either a "table_head cup-t" or a plain "table_head" div; the former wins.
      var header = leagueRight.SelectSingleNode("//div[@class='table_head cup-t']")
                   ?? leagueRight.SelectSingleNode("//div[@class='table_head']");
      if (header == null)
          return;

      var tabList = header.SelectSingleNode("./ul");
      if (tabList == null)
          return;

      var parentData = tabList.SelectNodes(".//li");
      if (parentData == null)
          return;

      // Tab text -> id of the parent group created for it.
      Dictionary<string, string> parentIds;
      lock (result)
      {
          SetData(result, parentData, eventId, season, out parentIds, "", seasonUrl);
      }

      var stageTasks = new List<Task>();
      foreach (HtmlNode stage in parentData)
      {
          // Play-off tabs ("附加赛") have no sub-groups.
          if (stage.InnerText.Contains("附加赛"))
              continue;

          // Tabs that SetData skipped (e.g. "全部" or duplicates of an earlier tab text
          // that was itself skipped) have no parent id, so no child could be attached.
          string parentId;
          if (!parentIds.TryGetValue(stage.InnerText, out parentId))
              continue;

          var seasonType = parentData.IndexOf(stage) + 1;
          var stageUrl = BuildStageUrl(seasonUrl, seasonType);
          stageTasks.Add(Task.Run(() => CrawlStageChildren(result, eventId, season, seasonUrl, stageUrl, parentId, seasonType)));
      }
      Task.WaitAll(stageTasks.ToArray());
  }

  /// <summary>
  /// Loads one stage page and records its second-level groups under <paramref name="parentId"/>.
  /// </summary>
  private void CrawlStageChildren(List<F_Grouping> result, string eventId, string season, string seasonUrl, string stageUrl, string parentId, int seasonType)
  {
      var doc = CommonHelper.GetHtmlHtmlDocument(new HtmlParameterDTO { Url = stageUrl, Title = "足彩", Timeout = 10 * 1000, NotIpNumber = 200 });
      if (doc == null || doc.DocumentNode.InnerHtml == ConfigurationManager.AppSettings["Termination"])
          return;

      var leagueRight = doc.DocumentNode.SelectSingleNode("//div[@class='league_right']");
      if (leagueRight == null)
          return;

      Dictionary<string, string> unusedChildIds;

      // Variant 1: children listed as <em> elements inside "league_right_021".
      var emContainer = leagueRight.SelectSingleNode("//*[@class='league_right_021']");
      if (emContainer != null)
      {
          var children = emContainer.SelectNodes(".//em");
          if (children != null && children.Count > 0)
          {
              lock (result)
              {
                  SetData(result, children, eventId, season, out unusedChildIds, parentId, seasonUrl);
              }
          }
      }

      // Variant 2: children listed as links next to the "current" item of the stage's tab panel.
      var tabPanel = leagueRight.SelectSingleNode("//*[@id='tabs9_main_{0}']".FormatMe(seasonType));
      if (tabPanel != null)
      {
          var currentItem = tabPanel.SelectSingleNode(".//li[@class='current']");
          if (currentItem != null)
          {
              var children = currentItem.ParentNode.SelectNodes(".//a");
              if (children != null && children.Count > 0)
              {
                  lock (result)
                  {
                      SetData(result, children, eventId, season, out unusedChildIds, parentId, seasonUrl);
                  }
              }
          }
      }
  }

  /// <summary>
  /// Builds the stage URL: the season URL without one trailing slash or backslash, plus the seasonType query.
  /// </summary>
  private static string BuildStageUrl(string seasonUrl, int seasonType)
  {
      var baseUrl = seasonUrl;
      if (baseUrl.EndsWith("/", StringComparison.Ordinal) || baseUrl.EndsWith("\\", StringComparison.Ordinal))
          baseUrl = baseUrl.Substring(0, baseUrl.Length - 1);
      return baseUrl + "?seasonType=" + seasonType;
  }

  /// <summary>
  /// Compares the crawled groups with the stored current-year groups and bulk-inserts only what is missing.
  /// </summary>
  private void SaveNewGroupings(List<F_Grouping> result)
  {
      var data = new List<F_Grouping>();

      // Materialised once: the stored rows are scanned repeatedly below.
      var groupList = services.Query<F_Grouping>(" AND Season LIKE '{0}%' ".FormatMe(DateTime.Now.Year)).ToList();
      var storedRoots = groupList.Where(g => string.IsNullOrEmpty(g.ParentId)).ToList();

      // 1) Parent groups that are not stored yet are added together with all of their children.
      foreach (var parent in result.Where(r => string.IsNullOrEmpty(r.ParentId)).ToList())
      {
          var parentName = parent.Name.Trim();
          var alreadyStored = storedRoots.Any(g => g.EventId == parent.EventId && (g.Name ?? string.Empty).Trim() == parentName);
          if (alreadyStored)
              continue;

          data.Add(parent);
          data.AddRange(result.Where(r => r.ParentId == parent.Id));
      }

      // 2) For parents that are already stored, add only the children that are missing
      //    and re-attach them to the stored parent's id.
      foreach (var stored in storedRoots)
      {
          var storedName = (stored.Name ?? string.Empty).Trim();
          var crawledParent = result.FirstOrDefault(r => string.IsNullOrEmpty(r.ParentId)
                                                         && r.EventId == stored.EventId
                                                         && r.Name.Trim() == storedName);
          if (crawledParent == null)
              continue;

          // ParentId may be null on stored rows, so it is checked before trimming.
          var oldList = groupList.Where(g => g.ParentId != null
                                             && g.ParentId.Trim() == stored.Id
                                             && g.EventId == stored.EventId).ToList();
          var newList = result.Where(r => r.ParentId == crawledParent.Id).ToList();
          if (newList.Count == 0 || oldList.Count == newList.Count)
              continue;

          var oldNames = new HashSet<string>(oldList.Select(g => (g.Name ?? string.Empty).Trim()));
          foreach (var child in newList.Where(c => !oldNames.Contains(c.Name.Trim())))
          {
              child.ParentId = stored.Id;
              data.Add(child);
          }
      }

      services.SqlBulkCopyAdd(data);
  }
  /// <summary>
  /// Full crawl: collects the groups of every season of every event and bulk-inserts them.
  /// </summary>
  /// <param name="mainUrl">Base URL that the last path segment of each event's <c>Remark</c> is appended to.</param>
  /// <param name="pageIndex">
  /// Not read: the paging logic is disabled and the whole event list is processed in a single pass.
  /// Kept so existing callers still compile.
  /// </param>
  public void GetALL(string mainUrl = "http://saishi.zgzcw.com/soccer/cup/", int pageIndex = 1)
  {
      ThreadPool.SetMaxThreads(150, 150);
      var result = new List<F_Grouping>();

      var eventTasks = new List<Task>();
      eventList.ForEach(p =>
      {
          eventTasks.Add(Task.Run(() =>
          {
              try
              {
                  Trace.WriteLine("个数:" + eventList.IndexOf(p));
                  CrawlAllSeasonsOfEvent(result, mainUrl, p);
              }
              catch (Exception ex)
              {
                  // The crawl is best-effort: one broken page must not discard the
                  // groups gathered for the other events, so the failure is logged.
                  Trace.WriteLine("GetALL: crawl failed for event " + p.Id + ": " + ex);
              }
          }));
      });

      // Every event task waits for its own nested tasks, so this blocks until all work is done
      // without spinning a CPU core.
      Task.WaitAll(eventTasks.ToArray());

      // Saved once. The method no longer calls itself: with paging disabled there is no
      // stop condition, so the former self-call re-crawled and re-inserted everything forever.
      services.SqlBulkCopyAdd<F_Grouping>(result);
  }

  /// <summary>
  /// Loads an event's cup page, collects all of its seasons and crawls each one in parallel.
  /// </summary>
  private void CrawlAllSeasonsOfEvent(List<F_Grouping> result, string mainUrl, F_Events evt)
  {
      if (evt.Remark.IsEmpty())
          return;

      // The last path segment of Remark is the event code.
      var url = mainUrl + evt.Remark.Split('/').LastOrDefault();
      var doc = CommonHelper.GetHtml(url, "足彩");
      if (doc == null)
          return;

      var seasonParentData = doc.DocumentNode.SelectSingleNode("//div[@class='div-select hoverSelect']");
      if (seasonParentData == null)
          return;

      // SelectNodes returns null (not an empty collection) when nothing matches.
      var seasonData = seasonParentData.SelectNodes(".//a");
      if (seasonData == null)
          return;

      // Season name -> season URL.
      var seasons = new Dictionary<string, string>();
      foreach (HtmlNode anchor in seasonData)
      {
          var li = anchor.SelectNodes(".//li");
          if (li == null || li.Count <= 0)
              continue;

          var href = anchor.Attributes["href"];
          if (href == null || string.IsNullOrEmpty(href.Value))
              continue;

          if (seasons.ContainsKey(li[0].InnerText) || href.Value.Contains("wwaattssuunn"))
              continue;

          seasons.Add(li[0].InnerText, href.Value);
      }

      var seasonTasks = new List<Task>();
      foreach (var season in seasons)
      {
          // Season URLs containing '-' are not crawled.
          if (season.Value.Contains("-"))
              continue;

          var seasonName = season.Key;
          var seasonUrl = season.Value;
          seasonTasks.Add(Task.Run(() => CrawlAllStagesOfSeason(result, evt.Id, seasonName, seasonUrl)));
      }
      Task.WaitAll(seasonTasks.ToArray());
  }

  /// <summary>
  /// Reads the first-level groups (stage tabs) of one season and crawls the children of each stage in parallel.
  /// </summary>
  private void CrawlAllStagesOfSeason(List<F_Grouping> result, string eventId, string season, string seasonUrl)
  {
      var currentTab = FindCurrentStageTab(seasonUrl);
      if (currentTab == null)
          return;

      var parentData = currentTab.ParentNode.SelectNodes("li");
      if (parentData == null)
          return;

      // Tab text -> id of the parent group created for it.
      Dictionary<string, string> parentIds;
      lock (result)
      {
          SetData(result, parentData, eventId, season, out parentIds, "", seasonUrl);
      }

      var baseUrl = seasonUrl;
      if (baseUrl.EndsWith("/", StringComparison.Ordinal) || baseUrl.EndsWith("\\", StringComparison.Ordinal))
          baseUrl = baseUrl.Substring(0, baseUrl.Length - 1);

      var stageTasks = new List<Task>();
      foreach (HtmlNode stage in parentData)
      {
          // Play-off tabs ("附加赛") have no sub-groups.
          if (stage.InnerText.Contains("附加赛"))
              continue;

          // Tabs that SetData skipped (e.g. "全部") have no parent id to attach children to.
          string parentId;
          if (!parentIds.TryGetValue(stage.InnerText, out parentId))
              continue;

          var seasonType = parentData.IndexOf(stage) + 1;
          var stageUrl = baseUrl + "?seasonType=" + seasonType;
          stageTasks.Add(Task.Run(() => CrawlAllChildrenOfStage(result, eventId, season, seasonUrl, stageUrl, parentId, seasonType)));
      }
      Task.WaitAll(stageTasks.ToArray());
  }

  /// <summary>
  /// Fetches the season page and returns the stage tab marked <c>class='cur'</c>.
  /// The page is re-fetched up to 20 times when the header or the tab is missing;
  /// nothing is retried when the very first response has no <c>league_right</c> container.
  /// </summary>
  /// <returns>The current stage tab, or <c>null</c> when it could not be found.</returns>
  private HtmlNode FindCurrentStageTab(string seasonUrl)
  {
      const int maxRetries = 20;
      string headerXPath = null;

      for (var attempt = 0; attempt <= maxRetries; attempt++)
      {
          // Each call owns its document; sharing one variable between tasks mixes up pages.
          var doc = CommonHelper.GetHtml(seasonUrl, "足彩");
          var leagueRight = doc == null ? null : doc.DocumentNode.SelectSingleNode("//div[@class='league_right']");
          if (leagueRight == null)
          {
              if (attempt == 0)
                  return null;
              continue;
          }

          // The header variant is decided on the first page only and reused for the retries.
          if (headerXPath == null)
          {
              headerXPath = leagueRight.SelectNodes("//div[@class='table_head cup-t']") != null
                  ? "//div[@class='table_head cup-t']"
                  : "//div[@class='table_head']";
          }

          var header = leagueRight.SelectSingleNode(headerXPath);
          var currentTab = header == null ? null : header.SelectSingleNode(".//li[@class='cur']");
          if (currentTab != null)
              return currentTab;
      }

      return null;
  }

  /// <summary>
  /// Loads one stage page and records its second-level groups under <paramref name="parentId"/>.
  /// </summary>
  private void CrawlAllChildrenOfStage(List<F_Grouping> result, string eventId, string season, string seasonUrl, string stageUrl, string parentId, int seasonType)
  {
      var doc = CommonHelper.GetHtml(stageUrl, "足彩");
      if (doc == null)
          return;

      var leagueRight = doc.DocumentNode.SelectSingleNode("//div[@class='league_right']");
      if (leagueRight == null)
          return;

      Dictionary<string, string> unusedChildIds;

      // Variant 1: children listed as <em> elements inside "league_right_021".
      var emContainer = leagueRight.SelectSingleNode("//*[@class='league_right_021']");
      if (emContainer != null)
      {
          var children = emContainer.SelectNodes(".//em");
          if (children != null && children.Count > 0)
          {
              lock (result)
              {
                  SetData(result, children, eventId, season, out unusedChildIds, parentId, seasonUrl);
              }
          }
      }

      // Variant 2: children listed as links next to the "current" item of the stage's tab panel.
      // The index belongs inside the quoted id value; appending it after the closing bracket
      // produced an invalid XPath expression.
      var tabPanel = leagueRight.SelectSingleNode("//*[@id='tabs9_main_" + seasonType + "']");
      if (tabPanel != null)
      {
          var currentItem = tabPanel.SelectSingleNode(".//li[@class='current']");
          if (currentItem != null)
          {
              var children = currentItem.ParentNode.SelectNodes(".//a");
              if (children != null && children.Count > 0)
              {
                  lock (result)
                  {
                      SetData(result, children, eventId, season, out unusedChildIds, parentId, seasonUrl);
                  }
              }
          }
      }
  }
  /// <summary>
  /// Converts HTML nodes into <c>F_Grouping</c> records and appends them to <paramref name="list"/>.
  /// Nodes whose trimmed text is "全部" and nodes whose raw text was already seen in this call are skipped.
  /// </summary>
  /// <param name="list">Accumulator shared by the crawl tasks; it is locked while the new records are appended.</param>
  /// <param name="data">Nodes to convert; a null collection adds nothing.</param>
  /// <param name="eventId">Id of the event the groups belong to.</param>
  /// <param name="season">Season name stored on every record.</param>
  /// <param name="parentDict">Receives a map from each node's raw inner text to the id generated for it.</param>
  /// <param name="parentId">Id of the parent group; empty for first-level groups.</param>
  /// <param name="remark">Stored in <c>Remark</c>; the callers pass the season URL.</param>
  /// <returns>The same <paramref name="list"/> instance that was passed in.</returns>
  private List<F_Grouping> SetData(List<F_Grouping> list, HtmlNodeCollection data, string eventId, string season, out Dictionary<string, string> parentDict, string parentId = "", string remark = "")
  {
      parentDict = new Dictionary<string, string>();
      if (data == null)
          return list;

      // Records are built outside the lock so the shared list is held only briefly.
      var created = new List<F_Grouping>();
      foreach (HtmlNode item in data)
      {
          var name = item.InnerText.Trim();

          // Checked before allocating a record and an id for it.
          if (name == "全部" || parentDict.ContainsKey(item.InnerText))
              continue;

          var model = new F_Grouping
          {
              Id = CommonHelper.GetGuid().ToString(),
              Name = name,
              ParentId = parentId,
              EventId = eventId,
              Season = season,
              Remark = remark,
              CreateDateTime = DateTime.Now
          };

          // Keyed by the raw (untrimmed) text, which is what the callers look up.
          parentDict.Add(item.InnerText, model.Id);
          created.Add(model);
      }

      if (created.Count > 0)
      {
          // List<T> is not safe for concurrent writers and this method runs on several tasks at once.
          lock (list)
          {
              list.AddRange(created);
          }
      }

      return list;
  }
  344. }
  345. }