using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using FCS.Common;
using FCS.Interface;
using FCS.Models;
using HtmlAgilityPack;
using Quartz;
using System.Threading.Tasks;
using System.Diagnostics;
using System.Threading;
using System.Web;
using System.IO;
using Newtonsoft.Json;
using System.Net;
using FCS.Crawler.ZCLotteryIP;
using System.Configuration;
using FCS.Models.DTO;

namespace FCS.Crawler.ZCLotteryGrouping
{
    /// <summary>
    /// Crawls the season pages of every known event and stores the parent groups
    /// (the tabs of a season page) and their child groups as <c>F_Grouping</c> rows.
    /// </summary>
    /// <remarks>
    /// <c>services</c> and <c>taskList</c> are not declared in this class; they are
    /// presumably inherited from <c>CommonJob</c> (confirm against the base class).
    /// </remarks>
    public class GroupingJob : CommonJob
    {
        // Title passed to the page loaders with every request.
        private const string PageTitle = "足彩";

        // A parent tab whose text contains this marker has no child groups and is not crawled.
        private const string PlayoffMarker = "附加赛";

        // Entries whose trimmed text equals this label are never stored.
        private const string AllGroupsLabel = "全部";

        private const string SeasonSelectorXPath = "//div[@class='div-select hoverSelect']";
        private const string LeaguePanelXPath = "//div[@class='league_right']";
        private const string CupTableHeadXPath = "//div[@class='table_head cup-t']";
        private const string TableHeadXPath = "//div[@class='table_head']";

        // Number of times GetALL re-downloads a season page that lacks the "cur" tab.
        private const int MaxSeasonPageRetries = 20;

        // Pause between two polls of CommonHelper.ThreadsFinsh() in GetALL.
        private const int CompletionPollMilliseconds = 200;

        // Guards every mutation of the shared result lists; SetData runs on many tasks at once.
        private readonly object resultGate = new object();

        // Event records loaded once in the constructor.
        // NOTE(review): the generic type argument of this field (and of services.Query below)
        // is missing from the text of this file as reviewed; it cannot be determined from
        // this file alone and must be restored from the original source.
        private List eventList;

        /// <summary>
        /// Loads all events through <c>services.Query</c>.
        /// </summary>
        public GroupingJob()
        {
            eventList = services.Query().ToList();
        }

        /// <summary>
        /// Incremental update: crawls the seasons of the current calendar year for every
        /// event, compares the crawled groups with the rows already stored for that year
        /// and bulk-inserts only the parent and child groups that are missing.
        /// </summary>
        /// <param name="mainUrl">Site root; each event's <c>Remark</c> is appended to it.</param>
        /// <param name="isUpdate">Not read by this method; kept for caller compatibility.</param>
        /// <exception cref="AggregateException">
        /// Thrown by <c>Task.WaitAll</c> when any crawl task faulted.
        /// </exception>
        public void Click(string mainUrl = "http://saishi.zgzcw.com", bool isUpdate = false)
        {
            var result = new List<F_Grouping>();
            string termination = ConfigurationManager.AppSettings["Termination"];

            // Only this thread touches taskList. Every top-level task waits for the tasks
            // it spawns, so waiting on taskList covers the whole crawl.
            eventList.ForEach(p =>
            {
                taskList.Add(Task.Run(() => CrawlCurrentSeasons(result, mainUrl + p.Remark, p.Id, termination)));
            });
            Task.WaitAll(taskList.ToArray());

            var data = new List<F_Grouping>();

            // Rows already stored for seasons that start with the current year.
            // NOTE(review): the generic type argument of Query is missing here as well.
            var groupList = services.Query(" AND Season LIKE '{0}%' ".FormatMe(DateTime.Now.Year));

            // Parent groups that are not stored yet are inserted together with all their children.
            var newParents = (from crawled in result
                              where !(from stored in groupList
                                      where string.IsNullOrEmpty(stored.ParentId) && crawled.EventId == stored.EventId
                                      select stored.Name.Trim()).Contains(crawled.Name.Trim())
                                    && string.IsNullOrEmpty(crawled.ParentId)
                              select crawled).ToList();
            newParents.ForEach(parent =>
            {
                data.Add(parent);
                result.Where(child => child.ParentId == parent.Id).ToList().ForEach(child => data.Add(child));
            });

            // For parents that are already stored, insert only the children that are missing.
            groupList.Where(stored => string.IsNullOrEmpty(stored.ParentId)).ToList().ForEach(storedParent =>
            {
                // ParentId may be null on stored rows, so coalesce before trimming.
                var storedChildren = groupList
                    .Where(s => (s.ParentId ?? "").Trim() == storedParent.Id && s.EventId == storedParent.EventId)
                    .ToList();
                var crawledParents = result
                    .Where(s => s.Name.Trim() == storedParent.Name.Trim()
                                && s.EventId == storedParent.EventId
                                && string.IsNullOrEmpty(s.ParentId))
                    .ToList();
                string crawledParentId = crawledParents.Count > 0 ? crawledParents[0].Id : "0";
                var crawledChildren = result.Where(s => s.ParentId == crawledParentId).ToList();

                if (crawledChildren.Count > 0 && storedChildren.Count != crawledChildren.Count)
                {
                    var missing = (from crawled in crawledChildren
                                   where !(from stored in storedChildren select stored.Name.Trim()).Contains(crawled.Name.Trim())
                                   select crawled).ToList();
                    missing.ForEach(child =>
                    {
                        // Re-point the crawled child at the parent row that already exists.
                        child.ParentId = storedParent.Id;
                        data.Add(child);
                    });
                }
            });

            services.SqlBulkCopyAdd(data);
        }

        /// <summary>
        /// Full crawl: collects the groups of every season of every event and bulk-inserts
        /// them without comparing against stored rows.
        /// </summary>
        /// <param name="mainUrl">
        /// Base URL; the last '/'-separated segment of each event's <c>Remark</c> is appended.
        /// </param>
        /// <param name="pageIndex">
        /// Not read. The paging that used it is disabled, so the method processes all events
        /// in one pass; the parameter is kept for caller compatibility.
        /// </param>
        public void GetALL(string mainUrl = "http://saishi.zgzcw.com/soccer/cup/", int pageIndex = 1)
        {
            ThreadPool.SetMaxThreads(150, 150);
            var result = new List<F_Grouping>();

            eventList.ForEach(p =>
            {
                Task.Run(() =>
                {
                    Trace.WriteLine("个数:" + eventList.IndexOf(p));
                    if (p.Remark.IsEmpty())
                    {
                        return;
                    }

                    // The last path segment of Remark is used as the event code.
                    CrawlAllSeasons(result, mainUrl + p.Remark.Split('/').LastOrDefault(), p.Id);
                });
            });

            // Completion is reported by CommonHelper.ThreadsFinsh(), whose implementation is
            // not visible here. Sleep between polls instead of spinning at full speed.
            while (!CommonHelper.ThreadsFinsh())
            {
                Thread.Sleep(CompletionPollMilliseconds);
            }

            // Copy under the lock so a late task cannot modify the list while it is being read.
            List<F_Grouping> snapshot;
            lock (resultGate)
            {
                snapshot = new List<F_Grouping>(result);
            }
            services.SqlBulkCopyAdd(snapshot);

            // The method used to call itself with pageIndex + 1 here. With the paging guard
            // disabled that recursion never terminated, so the call has been removed.
        }

        /// <summary>
        /// Converts HTML nodes into <c>F_Grouping</c> records and appends them to <paramref name="list"/>.
        /// </summary>
        /// <param name="list">Shared target list; appended to under a lock.</param>
        /// <param name="data">Nodes whose inner text is the group name; <c>null</c> adds nothing.</param>
        /// <param name="eventId">Id of the event the groups belong to.</param>
        /// <param name="season">Season name stored on every record.</param>
        /// <param name="parentDict">Receives the untrimmed inner text of every added node mapped to the new record's Id.</param>
        /// <param name="parentId">Id of the parent group; empty for a parent group.</param>
        /// <param name="remark">Stored in <c>Remark</c>; callers pass the season URL.</param>
        /// <returns>The same <paramref name="list"/> instance.</returns>
        private List<F_Grouping> SetData(List<F_Grouping> list, HtmlNodeCollection data, string eventId, string season, out Dictionary<string, string> parentDict, string parentId = "", string remark = "")
        {
            parentDict = new Dictionary<string, string>();
            if (data == null)
            {
                return list;
            }

            foreach (HtmlNode item in data)
            {
                string name = item.InnerText.Trim();

                // Skip the "all" entry and any text already seen in this call.
                if (name == AllGroupsLabel || parentDict.ContainsKey(item.InnerText))
                {
                    continue;
                }

                var model = new F_Grouping
                {
                    Id = CommonHelper.GetGuid().ToString(),
                    Name = name,
                    ParentId = parentId,
                    EventId = eventId,
                    Season = season,
                    Remark = remark,
                    CreateDateTime = DateTime.Now
                };
                parentDict.Add(item.InnerText, model.Id);

                lock (resultGate)
                {
                    list.Add(model);
                }
            }

            return list;
        }

        // Loads a page with the loader Click uses and returns the document's root node.
        private static HtmlNode LoadRootForUpdate(string url)
        {
            return CommonHelper.GetHtmlHtmlDocument(new HtmlParameterDTO { Url = url, Title = PageTitle }).DocumentNode;
        }

        // Loads a page with the loader GetALL uses and returns the document's root node.
        private static HtmlNode LoadRootForFullCrawl(string url)
        {
            return CommonHelper.GetHtml(url, PageTitle).DocumentNode;
        }

        // Click, level 1: reads an event page, keeps the seasons of the current year,
        // crawls each of them on its own task and waits for all of them.
        private void CrawlCurrentSeasons(List<F_Grouping> result, string eventUrl, string eventId, string termination)
        {
            HtmlNode root = LoadRootForUpdate(eventUrl);
            HtmlNode seasonParent = root.SelectSingleNode(SeasonSelectorXPath);
            if (seasonParent == null || root.InnerHtml == termination)
            {
                return;
            }

            HtmlNodeCollection seasonLinks = seasonParent.SelectNodes(".//a");
            if (seasonLinks == null)
            {
                return;
            }

            string thisYear = DateTime.Now.Year.ToString();
            string lastYear = DateTime.Now.AddYears(-1).Year.ToString();
            var seasons = new Dictionary<string, string>(); // season name -> season URL

            foreach (HtmlNode link in seasonLinks)
            {
                HtmlNodeCollection items = link.SelectNodes(".//li");
                if (items == null || items.Count <= 0)
                {
                    continue;
                }

                // Keep names that mention this year but not the previous one.
                string seasonName = items[0].InnerText;
                if (!seasonName.Contains(thisYear) || seasonName.Contains(lastYear) || seasons.ContainsKey(seasonName))
                {
                    continue;
                }

                HtmlAttribute href = link.Attributes["href"];
                if (href != null)
                {
                    seasons.Add(seasonName, href.Value);
                }
            }

            var seasonTasks = new List<Task>();
            foreach (var season in seasons)
            {
                string seasonName = season.Key;
                string seasonUrl = season.Value;
                seasonTasks.Add(Task.Run(() => CrawlSeasonForUpdate(result, eventId, seasonName, seasonUrl, termination)));
            }
            Task.WaitAll(seasonTasks.ToArray());
        }

        // Click, level 2: reads the parent tabs of one season page, stores them and
        // waits for the child-group crawl of every tab.
        private void CrawlSeasonForUpdate(List<F_Grouping> result, string eventId, string seasonName, string seasonUrl, string termination)
        {
            HtmlNode root = LoadRootForUpdate(seasonUrl);
            HtmlNode panel = root.SelectSingleNode(LeaguePanelXPath);
            if (panel == null || root.InnerHtml == termination)
            {
                return;
            }

            HtmlNode head = panel.SelectSingleNode(ResolveTableHeadXPath(panel));
            HtmlNode tabList = head == null ? null : head.SelectSingleNode("./ul");
            HtmlNodeCollection parentNodes = tabList == null ? null : tabList.SelectNodes(".//li");
            if (parentNodes == null)
            {
                return;
            }

            List<Task> childTasks = StoreParentsAndStartChildCrawls(
                result, LoadRootForUpdate, parentNodes, eventId, seasonName, seasonUrl, termination);
            Task.WaitAll(childTasks.ToArray());
        }

        // GetALL, level 1: reads an event page, collects every season link and starts
        // one fire-and-forget crawl per season.
        private void CrawlAllSeasons(List<F_Grouping> result, string eventUrl, string eventId)
        {
            HtmlNode root = LoadRootForFullCrawl(eventUrl);
            HtmlNode seasonParent = root.SelectSingleNode(SeasonSelectorXPath);
            if (seasonParent == null)
            {
                return;
            }

            HtmlNodeCollection seasonLinks = seasonParent.SelectNodes(".//a");
            if (seasonLinks == null)
            {
                return;
            }

            var seasons = new Dictionary<string, string>(); // season name -> season URL
            foreach (HtmlNode link in seasonLinks)
            {
                HtmlNodeCollection items = link.SelectNodes(".//li");
                if (items == null || items.Count <= 0)
                {
                    continue;
                }

                string seasonName = items[0].InnerText;
                if (seasons.ContainsKey(seasonName))
                {
                    continue;
                }

                HtmlAttribute href = link.Attributes["href"];
                if (href == null || href.Value.Contains("wwaattssuunn"))
                {
                    continue;
                }

                seasons.Add(seasonName, href.Value);
            }

            foreach (var season in seasons)
            {
                string seasonName = season.Key;
                string seasonUrl = season.Value;

                // Season URLs containing '-' are not crawled.
                if (seasonUrl.Contains("-"))
                {
                    continue;
                }

                Task.Run(() => CrawlSeasonForFullCrawl(result, eventId, seasonName, seasonUrl));
            }
        }

        // GetALL, level 2: locates the selected ("cur") tab of a season page, re-downloading
        // the page up to MaxSeasonPageRetries times, then stores the parent tabs and starts
        // the child-group crawls without waiting for them.
        private void CrawlSeasonForFullCrawl(List<F_Grouping> result, string eventId, string seasonName, string seasonUrl)
        {
            HtmlNode panel = LoadRootForFullCrawl(seasonUrl).SelectSingleNode(LeaguePanelXPath);
            if (panel == null)
            {
                return;
            }

            // The header variant is chosen from the first download and reused for retries.
            string headXPath = ResolveTableHeadXPath(panel);
            HtmlNode currentTab = FindCurrentTab(panel, headXPath);
            for (int retry = 1; currentTab == null && retry <= MaxSeasonPageRetries; retry++)
            {
                panel = LoadRootForFullCrawl(seasonUrl).SelectSingleNode(LeaguePanelXPath);
                currentTab = FindCurrentTab(panel, headXPath);
            }

            if (currentTab == null || currentTab.ParentNode == null)
            {
                return;
            }

            HtmlNodeCollection parentNodes = currentTab.ParentNode.SelectNodes("li");
            if (parentNodes == null)
            {
                return;
            }

            // The returned tasks are not awaited here; GetALL polls for completion instead.
            StoreParentsAndStartChildCrawls(
                result, LoadRootForFullCrawl, parentNodes, eventId, seasonName, seasonUrl, null);
        }

        // Picks the header XPath: the "cup-t" variant when the page has one, otherwise the plain one.
        private static string ResolveTableHeadXPath(HtmlNode panel)
        {
            return panel.SelectNodes(CupTableHeadXPath) == null ? TableHeadXPath : CupTableHeadXPath;
        }

        // Returns the <li class='cur'> inside the table header, or null when any part is missing.
        private static HtmlNode FindCurrentTab(HtmlNode panel, string headXPath)
        {
            if (panel == null)
            {
                return null;
            }

            HtmlNode head = panel.SelectSingleNode(headXPath);
            return head == null ? null : head.SelectSingleNode(".//li[@class='cur']");
        }

        // Stores the parent tabs as parent groups and starts one child-group crawl per tab.
        // Returns the started tasks so the caller can decide whether to wait for them.
        private List<Task> StoreParentsAndStartChildCrawls(
            List<F_Grouping> result,
            Func<string, HtmlNode> loadRoot,
            HtmlNodeCollection parentNodes,
            string eventId,
            string seasonName,
            string seasonUrl,
            string termination)
        {
            Dictionary<string, string> parentIds;
            SetData(result, parentNodes, eventId, seasonName, out parentIds, "", seasonUrl);

            var tasks = new List<Task>();
            foreach (HtmlNode tab in parentNodes)
            {
                // Tabs marked as play-offs have no child groups.
                if (tab.InnerText.Contains(PlayoffMarker))
                {
                    continue;
                }

                // Tabs that SetData did not store have no parent record to attach children to.
                string parentId;
                if (!parentIds.TryGetValue(tab.InnerText, out parentId))
                {
                    continue;
                }

                // seasonType is the 1-based position of the tab.
                int seasonType = parentNodes.IndexOf(tab) + 1;
                tasks.Add(Task.Run(() => CrawlChildGroups(
                    result, loadRoot, eventId, seasonName, seasonUrl, seasonType, parentId, termination)));
            }

            return tasks;
        }

        // Level 3 (shared): downloads the page of one parent tab and stores its child groups,
        // taken from the <em> elements of "league_right_021" and from the links next to the
        // selected item of "tabs9_main_N".
        private void CrawlChildGroups(
            List<F_Grouping> result,
            Func<string, HtmlNode> loadRoot,
            string eventId,
            string seasonName,
            string seasonUrl,
            int seasonType,
            string parentId,
            string termination)
        {
            HtmlNode root = loadRoot(BuildSeasonTypeUrl(seasonUrl, seasonType));

            // A null termination value disables the check (GetALL never applied it).
            if (termination != null && root.InnerHtml == termination)
            {
                return;
            }

            HtmlNode panel = root.SelectSingleNode(LeaguePanelXPath);
            if (panel == null)
            {
                return;
            }

            Dictionary<string, string> unused;

            HtmlNode emHost = panel.SelectSingleNode("//*[@class='league_right_021']");
            if (emHost != null)
            {
                HtmlNodeCollection ems = emHost.SelectNodes(".//em");
                if (ems != null && ems.Count > 0)
                {
                    SetData(result, ems, eventId, seasonName, out unused, parentId, seasonUrl);
                }
            }

            HtmlNode tabHost = panel.SelectSingleNode("//*[@id='tabs9_main_" + seasonType + "']");
            if (tabHost != null)
            {
                HtmlNode selected = tabHost.SelectSingleNode(".//li[@class='current']");
                HtmlNodeCollection links = selected == null || selected.ParentNode == null
                    ? null
                    : selected.ParentNode.SelectNodes(".//a");
                if (links != null && links.Count > 0)
                {
                    SetData(result, links, eventId, seasonName, out unused, parentId, seasonUrl);
                }
            }
        }

        // Appends "?seasonType=N" to the season URL after removing one trailing '/' or '\'.
        private static string BuildSeasonTypeUrl(string seasonUrl, int seasonType)
        {
            string baseUrl = seasonUrl;
            if (baseUrl.Length > 0)
            {
                char last = baseUrl[baseUrl.Length - 1];
                if (last == '/' || last == '\\')
                {
                    baseUrl = baseUrl.Substring(0, baseUrl.Length - 1);
                }
            }

            return baseUrl + "?seasonType=" + seasonType;
        }
    }
}