// GroupingJob.cs
  using System;
  using System.Collections.Generic;
  using System.Configuration;
  using System.Diagnostics;
  using System.Globalization;
  using System.IO;
  using System.Linq;
  using System.Net;
  using System.Text;
  using System.Threading;
  using System.Threading.Tasks;
  using System.Web;
  using FCS.Common;
  using FCS.Crawler.ZCLotteryIP;
  using FCS.Interface;
  using FCS.Models;
  using FCS.Models.DTO;
  using HtmlAgilityPack;
  using Newtonsoft.Json;
  using Quartz;
  namespace FCS.Crawler.ZCLotteryGrouping
  {
      /// <summary>
      /// Crawls the season / stage / sub-group structure of every event returned by
      /// <c>services.Query&lt;F_Events&gt;()</c> and bulk-inserts the resulting
      /// <see cref="F_Grouping"/> rows.
      /// </summary>
      /// <remarks>
      /// Pages are fetched in parallel. Every level (event, season, stage) starts its own
      /// child tasks and waits for them before returning, so when the top-level wait
      /// completes no task is still writing to the shared result list.
      /// </remarks>
      public class GroupingJob : CommonJob
      {
          // Value passed as the "title" argument of the CommonHelper page loaders.
          private const string PageTitle = "足彩";

          // Container that holds the season drop-down.
          private const string SeasonSelectorXPath = "//div[@class='div-select hoverSelect']";

          // Container that holds the stage list and the sub-group lists.
          private const string LeagueRightXPath = "//div[@class='league_right']";

          // Stages whose text contains this marker (play-offs) have no sub-groups.
          private const string PlayoffMarker = "附加赛";

          // "All" pseudo entry that must never be stored as a grouping.
          private const string AllMarker = "全部";

          // How many times GetALL re-downloads a season page whose stage list is missing.
          private const int MaxStageListRetries = 20;

          // Guards every write to the result list that the crawl tasks share.
          private readonly object resultGate = new object();

          // Event (competition) rows loaded once when the job is constructed.
          private List<F_Events> eventList;

          /// <summary>
          /// Loads all events. Paging over the event list is currently disabled
          /// (the original page-size / page-count code was commented out).
          /// </summary>
          public GroupingJob()
          {
              eventList = services.Query<F_Events>().ToList();
          }

          /// <summary>
          /// Grouping update task: crawls the seasons whose name contains the current year
          /// (and not the previous year) and inserts the groupings that are not stored yet.
          /// </summary>
          /// <param name="mainUrl">Site root; each event's <c>Remark</c> is appended to it.</param>
          /// <param name="isUpdate">Not used by this method; kept for caller compatibility.</param>
          /// <exception cref="AggregateException">A crawl task failed; nothing is inserted in that case.</exception>
          public void Click(string mainUrl = "http://saishi.zgzcw.com", bool isUpdate = false)
          {
              var result = new List<F_Grouping>();
              var now = DateTime.Now;
              var thisYear = now.Year.ToString(CultureInfo.InvariantCulture);
              var lastYear = now.AddYears(-1).Year.ToString(CultureInfo.InvariantCulture);
              var termination = ConfigurationManager.AppSettings["Termination"];

              // Only the calling thread touches the inherited taskList; nested tasks are
              // tracked in local lists by the level that starts them.
              foreach (var ev in eventList)
              {
                  var current = ev;
                  taskList.Add(Task.Run(() => CrawlCurrentSeasons(current, mainUrl, thisYear, lastYear, termination, result)));
              }

              Task.WaitAll(taskList.ToArray());

              // Rows already stored for this year; materialised once because it is scanned repeatedly.
              var stored = services.Query<F_Grouping>(" AND Season LIKE '{0}%' ".FormatMe(DateTime.Now.Year)).ToList();
              services.SqlBulkCopyAdd(SelectRowsToInsert(result, stored));
          }

          /// <summary>
          /// Crawls the groupings of every season of every event and bulk-inserts all of them.
          /// </summary>
          /// <param name="mainUrl">Base URL; the last path segment of each event's <c>Remark</c> is appended.</param>
          /// <param name="pageIndex">
          /// Not used: paging is disabled. The method previously called itself with
          /// <c>pageIndex + 1</c> without any stop condition, which repeated the full crawl
          /// and insert without end; the whole event list is now processed exactly once.
          /// </param>
          public void GetALL(string mainUrl = "http://saishi.zgzcw.com/soccer/cup/", int pageIndex = 1)
          {
              ThreadPool.SetMaxThreads(150, 150);
              var result = new List<F_Grouping>();
              var eventTasks = new List<Task>();

              for (var i = 0; i < eventList.Count; i++)
              {
                  var current = eventList[i];
                  var position = i;
                  eventTasks.Add(Task.Run(() =>
                  {
                      Trace.WriteLine("个数:" + position);
                      CrawlAllSeasons(current, mainUrl, result);
                  }));
              }

              try
              {
                  // Replaces the former busy-wait loop: blocks until every crawl task is done.
                  Task.WaitAll(eventTasks.ToArray());
              }
              catch (AggregateException ex)
              {
                  // The tasks used to be fire-and-forget, so a failed page never stopped the
                  // import. Keep that best-effort behaviour, but make the failures visible.
                  Trace.TraceError("GetALL: some pages could not be crawled: " + ex.Flatten());
              }

              services.SqlBulkCopyAdd<F_Grouping>(result);
          }

          /// <summary>
          /// Event level of <see cref="Click"/>: reads the season list of one event and crawls
          /// the seasons of the current year in parallel.
          /// </summary>
          private void CrawlCurrentSeasons(F_Events ev, string mainUrl, string thisYear, string lastYear, string termination, List<F_Grouping> result)
          {
              var root = LoadUpdatePage(mainUrl + ev.Remark, termination);
              if (root == null)
              {
                  return;
              }

              var seasonContainer = root.SelectSingleNode(SeasonSelectorXPath);
              if (seasonContainer == null)
              {
                  return;
              }

              var seasons = ReadSeasonLinks(
                  seasonContainer,
                  (name, href) => name.Contains(thisYear) && !name.Contains(lastYear));

              var seasonTasks = new List<Task>();
              foreach (var season in seasons)
              {
                  var seasonName = season.Key;
                  var seasonUrl = season.Value;
                  seasonTasks.Add(Task.Run(() => CrawlCurrentSeason(ev, seasonName, seasonUrl, termination, result)));
              }

              Task.WaitAll(seasonTasks.ToArray());
          }

          /// <summary>
          /// Season level of <see cref="Click"/>: reads the stage list of one season page and
          /// crawls the sub-groups of each stage.
          /// </summary>
          private void CrawlCurrentSeason(F_Events ev, string seasonName, string seasonUrl, string termination, List<F_Grouping> result)
          {
              var root = LoadUpdatePage(seasonUrl, termination);
              if (root == null)
              {
                  return;
              }

              var leagueRight = root.SelectSingleNode(LeagueRightXPath);
              if (leagueRight == null)
              {
                  return;
              }

              // Every step of header -> ul -> li may be missing on a given page.
              var header = leagueRight.SelectSingleNode(ResolveStageHeaderXPath(leagueRight));
              var stageList = header == null ? null : header.SelectSingleNode("./ul");
              var stages = stageList == null ? null : stageList.SelectNodes(".//li");
              if (stages == null)
              {
                  return;
              }

              CrawlStages(ev.Id, seasonName, seasonUrl, stages, result, url => LoadUpdatePage(url, termination));
          }

          /// <summary>
          /// Event level of <see cref="GetALL"/>: reads the season list of one event and crawls
          /// every season in parallel.
          /// </summary>
          private void CrawlAllSeasons(F_Events ev, string mainUrl, List<F_Grouping> result)
          {
              if (ev.Remark.IsEmpty())
              {
                  return;
              }

              // The last path segment of Remark is the event code.
              var root = LoadFullPage(mainUrl + ev.Remark.Split('/').LastOrDefault());
              if (root == null)
              {
                  return;
              }

              var seasonContainer = root.SelectSingleNode(SeasonSelectorXPath);
              if (seasonContainer == null)
              {
                  return;
              }

              var seasons = ReadSeasonLinks(seasonContainer, (name, href) => !href.Contains("wwaattssuunn"));

              var seasonTasks = new List<Task>();
              foreach (var season in seasons)
              {
                  var seasonName = season.Key;
                  var seasonUrl = season.Value;
                  if (seasonUrl.Contains("-"))
                  {
                      continue;
                  }

                  seasonTasks.Add(Task.Run(() => CrawlFullSeason(ev, seasonName, seasonUrl, result)));
              }

              Task.WaitAll(seasonTasks.ToArray());
          }

          /// <summary>
          /// Season level of <see cref="GetALL"/>: locates the stage list through the item marked
          /// as current and crawls the sub-groups of each stage.
          /// </summary>
          private void CrawlFullSeason(F_Events ev, string seasonName, string seasonUrl, List<F_Grouping> result)
          {
              var currentStage = FindCurrentStageItem(seasonUrl);
              if (currentStage == null)
              {
                  return;
              }

              var stages = currentStage.ParentNode.SelectNodes("li");
              if (stages == null)
              {
                  return;
              }

              CrawlStages(ev.Id, seasonName, seasonUrl, stages, result, LoadFullPage);
          }

          /// <summary>
          /// Stores the stages as top-level groupings, then loads each stage page in parallel
          /// and stores its sub-groups under the matching stage.
          /// </summary>
          /// <param name="loadPage">Downloads a page and returns its document node, or null to skip it.</param>
          private void CrawlStages(string eventId, string season, string seasonUrl, HtmlNodeCollection stages, List<F_Grouping> result, Func<string, HtmlNode> loadPage)
          {
              Dictionary<string, string> stageIds;
              SetData(result, stages, eventId, season, out stageIds, "", seasonUrl);

              var stageTasks = new List<Task>();
              for (var i = 0; i < stages.Count; i++)
              {
                  var stageName = stages[i].InnerText;
                  if (stageName.Contains(PlayoffMarker))
                  {
                      continue;
                  }

                  // Entries SetData skipped (e.g. "全部") have no id and cannot own sub-groups.
                  string parentId;
                  if (!stageIds.TryGetValue(stageName, out parentId))
                  {
                      continue;
                  }

                  var stageNumber = i + 1; // seasonType is the 1-based position of the stage
                  stageTasks.Add(Task.Run(() =>
                  {
                      var pageRoot = loadPage(BuildStageUrl(seasonUrl, stageNumber));
                      if (pageRoot == null)
                      {
                          return;
                      }

                      var stageLeague = pageRoot.SelectSingleNode(LeagueRightXPath);
                      if (stageLeague == null)
                      {
                          return;
                      }

                      AddChildGroups(result, stageLeague, stageNumber, eventId, season, parentId, seasonUrl);
                  }));
              }

              Task.WaitAll(stageTasks.ToArray());
          }

          /// <summary>
          /// Reads the sub-groups of one stage page. Two page layouts are handled: an
          /// "league_right_021" element listing the groups as &lt;em&gt;, and a
          /// "tabs9_main_N" element listing them as links.
          /// </summary>
          private void AddChildGroups(List<F_Grouping> result, HtmlNode leagueRight, int stageNumber, string eventId, string season, string parentId, string seasonUrl)
          {
              Dictionary<string, string> ignored;

              var emHost = leagueRight.SelectSingleNode("//*[@class='league_right_021']");
              if (emHost != null)
              {
                  var emNodes = emHost.SelectNodes(".//em");
                  if (emNodes != null && emNodes.Count > 0)
                  {
                      SetData(result, emNodes, eventId, season, out ignored, parentId, seasonUrl);
                  }
              }

              // The stage number belongs inside the quoted id value.
              var tabXPath = string.Format(CultureInfo.InvariantCulture, "//*[@id='tabs9_main_{0}']", stageNumber);
              var tabHost = leagueRight.SelectSingleNode(tabXPath);
              if (tabHost != null)
              {
                  var currentTab = tabHost.SelectSingleNode(".//li[@class='current']");
                  if (currentTab != null)
                  {
                      var links = currentTab.ParentNode.SelectNodes(".//a");
                      if (links != null && links.Count > 0)
                      {
                          SetData(result, links, eventId, season, out ignored, parentId, seasonUrl);
                      }
                  }
              }
          }

          /// <summary>
          /// Decides which crawled rows must be inserted: top-level groupings unknown to the
          /// database together with their children, plus new children of known top-level groupings.
          /// </summary>
          /// <param name="crawled">Rows produced by the crawl; re-parented children are modified in place.</param>
          /// <param name="stored">Rows already in the database.</param>
          private static List<F_Grouping> SelectRowsToInsert(List<F_Grouping> crawled, List<F_Grouping> stored)
          {
              var data = new List<F_Grouping>();
              var storedParents = stored.Where(g => string.IsNullOrEmpty(g.ParentId)).ToList();

              // New top-level groupings, each followed by its crawled children.
              var newParents = crawled
                  .Where(c => string.IsNullOrEmpty(c.ParentId)
                      && !storedParents.Any(s => s.EventId == c.EventId && TrimmedName(s) == TrimmedName(c)))
                  .ToList();
              foreach (var parent in newParents)
              {
                  var parentId = parent.Id;
                  data.Add(parent);
                  data.AddRange(crawled.Where(c => c.ParentId == parentId).ToList());
              }

              // New children of top-level groupings that already exist.
              foreach (var storedParent in storedParents)
              {
                  var known = storedParent;
                  var crawledParent = crawled.FirstOrDefault(c =>
                      string.IsNullOrEmpty(c.ParentId)
                      && c.EventId == known.EventId
                      && TrimmedName(c) == TrimmedName(known));
                  if (crawledParent == null)
                  {
                      continue;
                  }

                  // ParentId is null or empty on top-level rows, so test it before trimming.
                  var storedChildren = stored
                      .Where(s => !string.IsNullOrEmpty(s.ParentId)
                          && s.ParentId.Trim() == known.Id
                          && s.EventId == known.EventId)
                      .ToList();
                  var crawledChildren = crawled.Where(c => c.ParentId == crawledParent.Id).ToList();
                  if (crawledChildren.Count == 0 || crawledChildren.Count == storedChildren.Count)
                  {
                      continue;
                  }

                  var storedNames = new HashSet<string>(storedChildren.Select(s => TrimmedName(s)));
                  foreach (var child in crawledChildren)
                  {
                      if (storedNames.Contains(TrimmedName(child)))
                      {
                          continue;
                      }

                      // Attach the new child to the parent row that is already stored.
                      child.ParentId = known.Id;
                      data.Add(child);
                  }
              }

              return data;
          }

          /// <summary>Returns the grouping name without surrounding whitespace (empty when null).</summary>
          private static string TrimmedName(F_Grouping grouping)
          {
              return (grouping.Name ?? string.Empty).Trim();
          }

          /// <summary>
          /// Collects season name to season URL from the season drop-down, keeping the first
          /// occurrence of each name and only the entries accepted by <paramref name="accept"/>.
          /// </summary>
          /// <param name="seasonContainer">Element that contains the season links.</param>
          /// <param name="accept">Filter called with (season name, href).</param>
          private static Dictionary<string, string> ReadSeasonLinks(HtmlNode seasonContainer, Func<string, string, bool> accept)
          {
              var seasons = new Dictionary<string, string>();

              // SelectNodes returns null, not an empty collection, when nothing matches.
              var anchors = seasonContainer.SelectNodes(".//a");
              if (anchors == null)
              {
                  return seasons;
              }

              foreach (HtmlNode anchor in anchors)
              {
                  var items = anchor.SelectNodes(".//li");
                  if (items == null || items.Count <= 0)
                  {
                      continue;
                  }

                  var seasonName = items[0].InnerText;
                  var href = anchor.Attributes["href"];
                  if (href == null || seasons.ContainsKey(seasonName) || !accept(seasonName, href.Value))
                  {
                      continue;
                  }

                  seasons.Add(seasonName, href.Value);
              }

              return seasons;
          }

          /// <summary>
          /// Picks the XPath of the stage header: the "cup" variant when the page has one,
          /// otherwise the plain variant.
          /// </summary>
          private static string ResolveStageHeaderXPath(HtmlNode leagueRight)
          {
              const string cupHeader = "//div[@class='table_head cup-t']";
              const string plainHeader = "//div[@class='table_head']";
              return leagueRight.SelectNodes(cupHeader) != null ? cupHeader : plainHeader;
          }

          /// <summary>
          /// Downloads a season page for <see cref="GetALL"/> and returns the stage item marked
          /// "cur". When the stage list is missing the page is downloaded again, up to
          /// <see cref="MaxStageListRetries"/> times. Returns null when nothing is found.
          /// </summary>
          private static HtmlNode FindCurrentStageItem(string seasonUrl)
          {
              var root = LoadFullPage(seasonUrl);
              var leagueRight = root == null ? null : root.SelectSingleNode(LeagueRightXPath);
              if (leagueRight == null)
              {
                  // As before, a page without the container is not retried.
                  return null;
              }

              // The header variant is decided on the first download and reused for the retries.
              var headerXPath = ResolveStageHeaderXPath(leagueRight);
              var current = SelectCurrentStageItem(leagueRight, headerXPath);
              for (var attempt = 1; current == null && attempt <= MaxStageListRetries; attempt++)
              {
                  root = LoadFullPage(seasonUrl);
                  leagueRight = root == null ? null : root.SelectSingleNode(LeagueRightXPath);
                  current = leagueRight == null ? null : SelectCurrentStageItem(leagueRight, headerXPath);
              }

              return current;
          }

          /// <summary>Returns the "cur" list item below the stage header, or null.</summary>
          private static HtmlNode SelectCurrentStageItem(HtmlNode leagueRight, string headerXPath)
          {
              var header = leagueRight.SelectSingleNode(headerXPath);
              return header == null ? null : header.SelectSingleNode(".//li[@class='cur']");
          }

          /// <summary>
          /// Builds the URL of one stage page: the season URL without a single trailing
          /// '/' or '\', followed by "?seasonType=N".
          /// </summary>
          private static string BuildStageUrl(string seasonUrl, int stageNumber)
          {
              var baseUrl = seasonUrl;
              if (baseUrl.Length > 0)
              {
                  var last = baseUrl[baseUrl.Length - 1];
                  if (last == '/' || last == '\\')
                  {
                      baseUrl = baseUrl.Substring(0, baseUrl.Length - 1);
                  }
              }

              return baseUrl + "?seasonType=" + stageNumber.ToString(CultureInfo.InvariantCulture);
          }

          /// <summary>
          /// Page loader used by <see cref="Click"/>. Returns the document node, or null when
          /// nothing was loaded or the page HTML equals the configured "Termination" value.
          /// </summary>
          private static HtmlNode LoadUpdatePage(string url, string termination)
          {
              var page = CommonHelper.GetHtmlHtmlDocument(new HtmlParameterDTO { Url = url, Title = PageTitle });
              if (page == null || page.DocumentNode.InnerHtml == termination)
              {
                  return null;
              }

              return page.DocumentNode;
          }

          /// <summary>Page loader used by <see cref="GetALL"/>. Returns the document node, or null.</summary>
          private static HtmlNode LoadFullPage(string url)
          {
              var page = CommonHelper.GetHtml(url, PageTitle);
              return page == null ? null : page.DocumentNode;
          }

          /// <summary>
          /// Converts HTML nodes into <see cref="F_Grouping"/> rows and appends them to
          /// <paramref name="list"/>. Nodes whose trimmed text is "全部" and nodes whose text
          /// was already seen in this call are skipped.
          /// </summary>
          /// <param name="list">Shared result list; the append is done under a lock because crawl tasks call this concurrently.</param>
          /// <param name="data">Nodes whose inner text is the grouping name; null adds nothing.</param>
          /// <param name="eventId">Id of the event the groupings belong to.</param>
          /// <param name="season">Season name stored on every row.</param>
          /// <param name="parentDict">Receives node inner text (untrimmed) mapped to the id of the created row.</param>
          /// <param name="parentId">Id of the parent grouping; empty for top-level groupings.</param>
          /// <param name="remark">Stored in the row's Remark; callers pass the season URL.</param>
          /// <returns>The same <paramref name="list"/> instance.</returns>
          private List<F_Grouping> SetData(List<F_Grouping> list, HtmlNodeCollection data, string eventId, string season, out Dictionary<string, string> parentDict, string parentId = "", string remark = "")
          {
              parentDict = new Dictionary<string, string>();
              if (data == null)
              {
                  return list;
              }

              var created = new List<F_Grouping>();
              foreach (HtmlNode item in data)
              {
                  var rawName = item.InnerText;
                  var name = rawName.Trim();

                  // Check for duplicates before building the row, so no id is generated for nothing.
                  if (name == AllMarker || parentDict.ContainsKey(rawName))
                  {
                      continue;
                  }

                  var model = new F_Grouping
                  {
                      Id = CommonHelper.GetGuid().ToString(),
                      Name = name,
                      ParentId = parentId,
                      EventId = eventId,
                      Season = season,
                      Remark = remark,
                      CreateDateTime = DateTime.Now
                  };
                  parentDict.Add(rawName, model.Id);
                  created.Add(model);
              }

              // List<T> is not safe for concurrent writers; publish the batch in one locked step.
              lock (resultGate)
              {
                  list.AddRange(created);
              }

              return list;
          }
      }
  }