123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371 |
- using System;
- using System.Collections.Generic;
- using System.Linq;
- using System.Text;
- using FCS.Common;
- using FCS.Interface;
- using FCS.Models;
- using HtmlAgilityPack;
- using Quartz;
- using System.Threading.Tasks;
- using System.Diagnostics;
- using System.Threading;
- using System.Web;
- using System.IO;
- using Newtonsoft.Json;
- using System.Net;
- using FCS.Crawler.ZCLotteryIP;
- using System.Configuration;
- using FCS.Models.DTO;
- namespace FCS.Crawler.ZCLotteryGrouping
- {
- public class GroupingJob : CommonJob
- {
- private List<F_Events> eventList;//赛事信息
/// <summary>
/// Creates the job and caches every <c>F_Events</c> row returned by <c>services.Query</c>,
/// so the crawl methods iterate an in-memory list instead of re-querying.
/// </summary>
public GroupingJob()
{
    // Materialize the query once; Click and GetALL both enumerate this list.
    var allEvents = services.Query<F_Events>();
    this.eventList = allEvents.ToList();
}
/// <summary>
/// Grouping update job: crawls the current-year seasons of every cached event, collects their
/// first-level groups and child groups, and bulk-inserts only the groups that are not stored yet.
/// </summary>
/// <remarks>
/// Each event is crawled in its own task. A task waits for the child tasks it starts, so waiting on
/// the top-level tasks is enough to know the whole crawl has finished.
/// </remarks>
/// <param name="mainUrl">Base URL that each event's <c>Remark</c> is appended to.</param>
/// <param name="isUpdate">Not read by this method; kept so existing callers keep compiling.</param>
public void Click(string mainUrl = "http://saishi.zgzcw.com", bool isUpdate = false)
{
    var result = new List<F_Grouping>();
    // Hoisted: these values do not change while the crawl is running.
    var termination = ConfigurationManager.AppSettings["Termination"];
    var thisYear = DateTime.Now.Year.ToString();
    var lastYear = DateTime.Now.AddYears(-1).Year.ToString();

    eventList.ForEach(p =>
    {
        // Only the calling thread touches taskList, so no task is registered after the snapshot below.
        taskList.Add(Task.Run(() => CrawlCurrentSeasons(p, mainUrl, termination, thisYear, lastYear, result)));
    });
    Task.WaitAll(taskList.ToArray());

    var data = new List<F_Grouping>();
    // Rows already stored for seasons that start with the current year; materialized once because
    // the list is scanned many times below.
    var groupList = services.Query<F_Grouping>(" AND Season LIKE '{0}%' ".FormatMe(DateTime.Now.Year)).ToList();
    var storedRoots = groupList.Where(g => string.IsNullOrEmpty(g.ParentId)).ToList();

    // Brand-new first-level groups, each followed by all of its crawled children.
    var newRoots = result
        .Where(a => string.IsNullOrEmpty(a.ParentId)
                    && !storedRoots.Any(b => a.EventId == b.EventId && b.Name.Trim() == a.Name.Trim()))
        .ToList();
    foreach (var root in newRoots)
    {
        data.Add(root);
        data.AddRange(result.Where(c => c.ParentId == root.Id));
    }

    // New children under first-level groups that are already stored.
    foreach (var stored in storedRoots)
    {
        // ParentId may be null on stored rows, so guard before trimming.
        var oldChildren = groupList
            .Where(s => s.ParentId != null && s.ParentId.Trim() == stored.Id && s.EventId == stored.EventId)
            .ToList();
        var crawledRoot = result.FirstOrDefault(s => s.Name.Trim() == stored.Name.Trim()
                                                     && s.EventId == stored.EventId
                                                     && string.IsNullOrEmpty(s.ParentId));
        // "0" is the fallback id used when the stored group was not seen in this crawl.
        var crawledRootId = crawledRoot != null ? crawledRoot.Id : "0";
        var newChildren = result.Where(s => s.ParentId == crawledRootId).ToList();
        if (newChildren.Count == 0 || oldChildren.Count == newChildren.Count)
        {
            continue;
        }

        var oldNames = new HashSet<string>(oldChildren.Select(b => b.Name.Trim()));
        foreach (var child in newChildren.Where(a => !oldNames.Contains(a.Name.Trim())))
        {
            // Re-point the crawled child at the stored parent row before inserting it.
            child.ParentId = stored.Id;
            data.Add(child);
        }
    }

    services.SqlBulkCopyAdd(data);
}

/// <summary>
/// Loads one event's main page, keeps the seasons whose name contains the current year but not the
/// previous one, and crawls each of them in its own task, returning when all of them are done.
/// </summary>
private void CrawlCurrentSeasons(F_Events evt, string mainUrl, string termination, string thisYear, string lastYear, List<F_Grouping> result)
{
    // Each task keeps its own document; sharing one variable across tasks lets them overwrite each other.
    var doc = CommonHelper.GetHtmlHtmlDocument(new HtmlParameterDTO { Url = mainUrl + evt.Remark, Title = "足彩" });
    if (doc == null)
    {
        return;
    }

    var seasonParent = doc.DocumentNode.SelectSingleNode("//div[@class='div-select hoverSelect']");
    if (seasonParent == null || doc.DocumentNode.InnerHtml == termination)
    {
        return;
    }

    var seasonLinks = seasonParent.SelectNodes(".//a");
    if (seasonLinks == null)
    {
        return;
    }

    // Season name -> season URL.
    var seasons = new Dictionary<string, string>();
    foreach (HtmlNode link in seasonLinks)
    {
        var li = link.SelectNodes(".//li");
        if (li == null || li.Count <= 0)
        {
            continue;
        }

        var seasonName = li[0].InnerText;
        var hrefAttr = link.Attributes["href"];
        if (hrefAttr == null
            || !seasonName.Contains(thisYear)
            || seasonName.Contains(lastYear)
            || seasons.ContainsKey(seasonName))
        {
            continue;
        }

        seasons.Add(seasonName, hrefAttr.Value);
    }

    var seasonTasks = new List<Task>();
    foreach (var season in seasons)
    {
        var name = season.Key;
        var url = season.Value;
        seasonTasks.Add(Task.Run(() => CrawlSeasonGroupings(evt.Id, name, url, termination, result)));
    }

    Task.WaitAll(seasonTasks.ToArray());
}

/// <summary>
/// Loads one season page, records its first-level groups and crawls the children of every group
/// except the play-off ("附加赛") ones, which have no child groups.
/// </summary>
private void CrawlSeasonGroupings(string eventId, string seasonName, string seasonUrl, string termination, List<F_Grouping> result)
{
    var doc = CommonHelper.GetHtmlHtmlDocument(new HtmlParameterDTO { Url = seasonUrl, Title = "足彩" });
    if (doc == null)
    {
        return;
    }

    var leagueRight = doc.DocumentNode.SelectSingleNode("//div[@class='league_right']");
    if (leagueRight == null || doc.DocumentNode.InnerHtml == termination)
    {
        return;
    }

    // The header is either "table_head cup-t" or plain "table_head".
    var headXPath = "//div[@class='table_head cup-t']";
    if (leagueRight.SelectNodes(headXPath) == null)
    {
        headXPath = "//div[@class='table_head']";
    }

    var head = leagueRight.SelectSingleNode(headXPath);
    var headList = head != null ? head.SelectSingleNode("./ul") : null;
    var stages = headList != null ? headList.SelectNodes(".//li") : null;
    if (stages == null)
    {
        return;
    }

    Dictionary<string, string> stageIds; // stage text -> id of the first-level group created for it
    SetData(result, stages, eventId, seasonName, out stageIds, "", seasonUrl);

    // Drop a single trailing slash before appending the query string.
    var stageBase = seasonUrl;
    if (seasonUrl.Length > 0
        && (seasonUrl[seasonUrl.Length - 1] == '/' || seasonUrl[seasonUrl.Length - 1] == '\\'))
    {
        stageBase = seasonUrl.Substring(0, seasonUrl.Length - 1);
    }

    var stageTasks = new List<Task>();
    for (var i = 0; i < stages.Count; i++)
    {
        var stageText = stages[i].InnerText;
        if (stageText.Contains("附加赛"))
        {
            continue;
        }

        string parentId;
        if (!stageIds.TryGetValue(stageText, out parentId))
        {
            // SetData skipped this entry (e.g. "全部"), so there is no parent to attach children to.
            continue;
        }

        var seasonType = i + 1; // copied to a local so the lambda does not capture the loop counter
        var stageUrl = stageBase + "?seasonType=" + seasonType;
        stageTasks.Add(Task.Run(() => CrawlStageChildren(eventId, seasonName, seasonUrl, stageUrl, seasonType, parentId, termination, result)));
    }

    Task.WaitAll(stageTasks.ToArray());
}

/// <summary>
/// Loads the page of one first-level group and records its child groups, read from the
/// "league_right_021" block and from the "tabs9_main_N" block when they are present.
/// </summary>
private void CrawlStageChildren(string eventId, string seasonName, string seasonUrl, string stageUrl, int seasonType, string parentId, string termination, List<F_Grouping> result)
{
    var doc = CommonHelper.GetHtmlHtmlDocument(new HtmlParameterDTO { Url = stageUrl, Title = "足彩" });
    if (doc == null || doc.DocumentNode.InnerHtml == termination)
    {
        return;
    }

    var leagueRight = doc.DocumentNode.SelectSingleNode("//div[@class='league_right']");
    if (leagueRight == null)
    {
        return;
    }

    Dictionary<string, string> unused;

    var roundBlock = leagueRight.SelectSingleNode("//*[@class='league_right_021']");
    if (roundBlock != null)
    {
        var rounds = roundBlock.SelectNodes(".//em");
        if (rounds != null && rounds.Count > 0)
        {
            SetData(result, rounds, eventId, seasonName, out unused, parentId, seasonUrl);
        }
    }

    var tabBlock = leagueRight.SelectSingleNode("//*[@id='tabs9_main_{0}']".FormatMe(seasonType));
    if (tabBlock != null)
    {
        var current = tabBlock.SelectSingleNode(".//li[@class='current']");
        var tabLinks = current != null ? current.ParentNode.SelectNodes(".//a") : null;
        if (tabLinks != null && tabLinks.Count > 0)
        {
            SetData(result, tabLinks, eventId, seasonName, out unused, parentId, seasonUrl);
        }
    }
}
/// <summary>
/// Crawls the groupings of every season of every cached event and bulk-inserts all of them.
/// </summary>
/// <remarks>
/// The crawl is best-effort: a failure while processing one page is written to the trace output and
/// the remaining pages are still processed. The method runs a single pass over all events; the
/// paging that <paramref name="pageIndex"/> once drove is disabled, so there is nothing to recurse into.
/// </remarks>
/// <param name="mainUrl">Base URL that the last path segment of each event's <c>Remark</c> is appended to.</param>
/// <param name="pageIndex">Not read by this method; kept so existing callers keep compiling.</param>
public void GetALL(string mainUrl = "http://saishi.zgzcw.com/soccer/cup/", int pageIndex = 1)
{
    ThreadPool.SetMaxThreads(150, 150);
    var result = new List<F_Grouping>();

    var eventTasks = new List<Task>();
    eventList.ForEach(p =>
    {
        eventTasks.Add(StartCollectStep(() =>
        {
            Trace.WriteLine("个数:" + eventList.IndexOf(p));
            CollectEventSeasons(p, mainUrl, result);
        }));
    });

    // Every step waits for the steps it starts, so this returns only when the whole crawl is done.
    Task.WaitAll(eventTasks.ToArray());
    services.SqlBulkCopyAdd<F_Grouping>(result);
}

/// <summary>
/// Starts one crawl step on the thread pool; an exception thrown by the step is traced instead of
/// faulting the task, so one bad page cannot abort the rest of the crawl.
/// </summary>
private static Task StartCollectStep(Action step)
{
    return Task.Run(() =>
    {
        try
        {
            step();
        }
        catch (Exception ex)
        {
            Trace.WriteLine("GetALL crawl step failed: " + ex);
        }
    });
}

/// <summary>
/// Loads one event's main page, collects its season links and crawls each season in its own step.
/// </summary>
private void CollectEventSeasons(F_Events evt, string mainUrl, List<F_Grouping> result)
{
    if (evt.Remark.IsEmpty())
    {
        return;
    }

    // The last path segment of Remark is the event code.
    var url = mainUrl + evt.Remark.Split('/').LastOrDefault();
    var doc = CommonHelper.GetHtml(url, "足彩");
    if (doc == null)
    {
        return;
    }

    var seasonParent = doc.DocumentNode.SelectSingleNode("//div[@class='div-select hoverSelect']");
    var seasonLinks = seasonParent != null ? seasonParent.SelectNodes(".//a") : null;
    if (seasonLinks == null)
    {
        return;
    }

    // Season name -> season URL.
    var seasons = new Dictionary<string, string>();
    foreach (HtmlNode link in seasonLinks)
    {
        var li = link.SelectNodes(".//li");
        if (li == null || li.Count <= 0 || seasons.ContainsKey(li[0].InnerText))
        {
            continue;
        }

        var hrefAttr = link.Attributes["href"];
        if (hrefAttr == null || hrefAttr.Value.Contains("wwaattssuunn"))
        {
            continue;
        }

        seasons.Add(li[0].InnerText, hrefAttr.Value);
    }

    var seasonTasks = new List<Task>();
    foreach (var season in seasons)
    {
        var name = season.Key;
        var seasonUrl = season.Value;
        // Season URLs containing '-' are not crawled.
        if (seasonUrl.Contains("-"))
        {
            continue;
        }

        seasonTasks.Add(StartCollectStep(() => CollectSeasonStages(evt.Id, name, seasonUrl, result)));
    }

    Task.WaitAll(seasonTasks.ToArray());
}

/// <summary>
/// Loads one season page (re-fetching it up to 20 times while the highlighted stage entry is
/// missing), records the first-level groups and crawls the children of every group except the
/// play-off ("附加赛") ones.
/// </summary>
private void CollectSeasonStages(string eventId, string seasonName, string seasonUrl, List<F_Grouping> result)
{
    const int maxRefetches = 20;

    // Each step keeps its own document; sharing one variable across steps lets them overwrite each other.
    var doc = CommonHelper.GetHtml(seasonUrl, "足彩");
    var leagueRight = doc != null ? doc.DocumentNode.SelectSingleNode("//div[@class='league_right']") : null;
    if (leagueRight == null)
    {
        return;
    }

    // The header is either "table_head cup-t" or plain "table_head".
    var headXPath = "//div[@class='table_head cup-t']";
    if (leagueRight.SelectNodes(headXPath) == null)
    {
        headXPath = "//div[@class='table_head']";
    }

    var head = leagueRight.SelectSingleNode(headXPath);
    var currentStage = head != null ? head.SelectSingleNode(".//li[@class='cur']") : null;

    for (var attempt = 1; attempt <= maxRefetches && (head == null || currentStage == null); attempt++)
    {
        doc = CommonHelper.GetHtml(seasonUrl, "足彩");
        leagueRight = doc != null ? doc.DocumentNode.SelectSingleNode("//div[@class='league_right']") : null;
        if (leagueRight == null)
        {
            head = null;
            continue;
        }

        head = leagueRight.SelectSingleNode(headXPath);
        if (head != null)
        {
            currentStage = head.SelectSingleNode(".//li[@class='cur']");
        }
    }

    if (currentStage == null)
    {
        return;
    }

    var stages = currentStage.ParentNode.SelectNodes("li");
    if (stages == null)
    {
        return;
    }

    Dictionary<string, string> stageIds; // stage text -> id of the first-level group created for it
    SetData(result, stages, eventId, seasonName, out stageIds, "", seasonUrl);

    // Drop a single trailing slash before appending the query string.
    var stageBase = seasonUrl;
    if (seasonUrl.Length > 0
        && (seasonUrl[seasonUrl.Length - 1] == '/' || seasonUrl[seasonUrl.Length - 1] == '\\'))
    {
        stageBase = seasonUrl.Substring(0, seasonUrl.Length - 1);
    }

    var stageTasks = new List<Task>();
    for (var i = 0; i < stages.Count; i++)
    {
        var stageText = stages[i].InnerText;
        if (stageText.Contains("附加赛"))
        {
            continue;
        }

        string parentId;
        if (!stageIds.TryGetValue(stageText, out parentId))
        {
            // SetData skipped this entry (e.g. "全部"), so there is no parent to attach children to.
            continue;
        }

        var seasonType = i + 1; // copied to a local so the lambda does not capture the loop counter
        var stageUrl = stageBase + "?seasonType=" + seasonType;
        stageTasks.Add(StartCollectStep(() => CollectStageChildren(eventId, seasonName, seasonUrl, stageUrl, seasonType, parentId, result)));
    }

    Task.WaitAll(stageTasks.ToArray());
}

/// <summary>
/// Loads the page of one first-level group and records its child groups, read from the
/// "league_right_021" block and from the "tabs9_main_N" block when they are present.
/// </summary>
private void CollectStageChildren(string eventId, string seasonName, string seasonUrl, string stageUrl, int seasonType, string parentId, List<F_Grouping> result)
{
    var doc = CommonHelper.GetHtml(stageUrl, "足彩");
    var leagueRight = doc != null ? doc.DocumentNode.SelectSingleNode("//div[@class='league_right']") : null;
    if (leagueRight == null)
    {
        return;
    }

    Dictionary<string, string> unused;

    var roundBlock = leagueRight.SelectSingleNode("//*[@class='league_right_021']");
    if (roundBlock != null)
    {
        var rounds = roundBlock.SelectNodes(".//em");
        if (rounds != null && rounds.Count > 0)
        {
            SetData(result, rounds, eventId, seasonName, out unused, parentId, seasonUrl);
        }
    }

    // The stage index belongs inside the quoted id value, matching the lookup used by Click.
    var tabBlock = leagueRight.SelectSingleNode("//*[@id='tabs9_main_" + seasonType + "']");
    if (tabBlock != null)
    {
        var current = tabBlock.SelectSingleNode(".//li[@class='current']");
        var tabLinks = current != null ? current.ParentNode.SelectNodes(".//a") : null;
        if (tabLinks != null && tabLinks.Count > 0)
        {
            SetData(result, tabLinks, eventId, seasonName, out unused, parentId, seasonUrl);
        }
    }
}
/// <summary>
/// Converts HTML nodes into <c>F_Grouping</c> rows and appends them to <paramref name="list"/>.
/// </summary>
/// <remarks>
/// Nodes whose trimmed text is "全部" are skipped, and so are nodes whose raw text already produced
/// a row in this call. Appends are done under a lock on <paramref name="list"/> because the callers
/// in this class invoke this method from several tasks that share one list.
/// </remarks>
/// <param name="list">Accumulator the new rows are appended to; must not be null.</param>
/// <param name="data">Nodes to convert; a null collection is treated as empty.</param>
/// <param name="eventId">Value stored in each row's <c>EventId</c>.</param>
/// <param name="season">Value stored in each row's <c>Season</c>.</param>
/// <param name="parentDict">Receives a map from each node's raw (untrimmed) inner text to the id of the row created for it.</param>
/// <param name="parentId">Value stored in each row's <c>ParentId</c>; empty for first-level groups.</param>
/// <param name="remark">Value stored in each row's <c>Remark</c>.</param>
/// <returns>The same <paramref name="list"/> instance that was passed in.</returns>
/// <exception cref="ArgumentNullException"><paramref name="list"/> is null.</exception>
private List<F_Grouping> SetData(List<F_Grouping> list, HtmlNodeCollection data, string eventId, string season, out Dictionary<string, string> parentDict, string parentId = "", string remark = "")
{
    parentDict = new Dictionary<string, string>();
    if (list == null)
    {
        throw new ArgumentNullException("list");
    }

    if (data == null)
    {
        return list;
    }

    foreach (HtmlNode item in data)
    {
        // Keys stay untrimmed because callers look entries up by the node's raw InnerText.
        var rawText = item.InnerText;
        var name = rawText.Trim();
        if (name == "全部" || parentDict.ContainsKey(rawText))
        {
            continue;
        }

        var model = new F_Grouping
        {
            Id = CommonHelper.GetGuid().ToString(),
            Name = name,
            ParentId = parentId,
            EventId = eventId,
            Season = season,
            Remark = remark,
            CreateDateTime = DateTime.Now
        };
        parentDict.Add(rawText, model.Id);

        // List<T>.Add is not safe for concurrent writers.
        lock (list)
        {
            list.Add(model);
        }
    }

    return list;
}
- }
- }
|