using System;
using System.Collections.Generic;
using System.Data;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using FCS.Common;
using FCS.Crawler.Tools;
using FCS.Interface;
using FCS.Models;
using HtmlAgilityPack;
using Newtonsoft.Json;
using Quartz;

namespace FCS.Crawler.ZCLotteryMatchs
{
    /// <summary>
    /// Football competitions job: downloads the competition index page and stores the areas,
    /// countries and competitions found on it through <c>services.AddArea</c> / <c>services.AddEvents</c>.
    /// </summary>
    public class FootballMatchJob : CommonJob, IJob
    {
        /// <summary>
        /// Areas stored by <see cref="GetCOntryMatch"/>; <see cref="GetAreaMatch"/> looks area ids up here by name.
        /// NOTE(review): the list is static and never cleared, so entries accumulate across executions - confirm intended.
        /// </summary>
        private static List<F_Area> f_Areas = new List<F_Area>();

        /// <summary>
        /// Maps the area heading shown on the page to the name of the matching event-type enum item.
        /// Headings that are not listed here leave <c>EventType</c> unassigned.
        /// </summary>
        private static readonly Dictionary<string, string> EventTypeNameByArea = new Dictionary<string, string>
        {
            { "欧洲赛事", "欧洲赛事" },
            { "亚洲赛事", "亚洲赛事" },
            { "美洲赛事", "美洲赛事" },
            { "非洲赛事", "非洲赛事" },
            { "澳洲赛事", "大洋洲赛事" }
        };

        public FootballMatchJob()
        {
            log = new LogHelper();
            // NOTE(review): Resolve() appears without a type argument in this copy of the file; if
            // IOC.Resolve is generic the argument was lost - confirm against version control.
            services = IOC.Resolve();
        }

        /// <summary>
        /// Job entry point: reads the job configuration from the data map, then runs the crawl.
        /// </summary>
        /// <param name="context">Execution context whose job data map carries the configuration.</param>
        public void Execute(IJobExecutionContext context)
        {
            Config = CommonHelper.GetConfigFromDataMap(context.JobDetail.JobDataMap);
            GetAll();
        }

        /// <summary>
        /// Crawls every configured index page.
        /// </summary>
        public void GetAll()
        {
            List<string> urls = new List<string> { "http://saishi.zgzcw.com/soccer/" };
            foreach (string url in urls)
            {
                // Country-level competitions go first: that pass fills f_Areas, which the
                // continental pass reads to resolve area ids.
                GetCOntryMatch(url);
                GetAreaMatch(url);
            }
        }

        /// <summary>
        /// Stores the continental competitions (the "mbcon mbr fl" blocks of the page).
        /// Failures are logged instead of propagating, matching <see cref="GetCOntryMatch"/>.
        /// </summary>
        /// <param name="mainUrl">Address of the index page to download.</param>
        private void GetAreaMatch(string mainUrl)
        {
            try
            {
                var htmlResource = NetHelper.GetUrlResponse(mainUrl, Encoding.GetEncoding("utf-8"));
                if (htmlResource == null)
                {
                    return;
                }

                HtmlDocument doc = LoadFragment(htmlResource);

                // Event-type enum items; a missing item yields null instead of an index-out-of-range failure.
                var enumData = services.GetDataItem(DataItemDetailEnum.FootBallMatchType);
                Func<string, string> resolveEventType = typeName =>
                {
                    var ids = enumData.Where(o => o.Name == typeName).Select(o => o.Id).ToList();
                    return ids.Count > 0 ? ids[0].ToString() : null;
                };

                HtmlNodeCollection areaHtml = doc.DocumentNode.SelectNodes("//*[@class='mbcon mbr fl']");
                if (areaHtml == null)
                {
                    return;
                }

                foreach (var areaItem in areaHtml)
                {
                    if (string.IsNullOrEmpty(areaItem.InnerText) || areaItem.InnerText.Contains("世界排名"))
                    {
                        continue;
                    }

                    // Re-root the block in its own document so the "//" queries below stay inside it.
                    HtmlDocument areaDoc = LoadFragment(areaItem.InnerHtml);

                    HtmlNodeCollection spans = areaDoc.DocumentNode.SelectNodes("//span");
                    if (spans == null || spans.Count == 0)
                    {
                        continue;
                    }

                    string area = spans[0].InnerText.Trim();

                    HtmlNodeCollection matchHtml = areaDoc.DocumentNode.SelectNodes("//div/a");
                    if (matchHtml == null)
                    {
                        continue;
                    }

                    // Both values depend only on the area, so resolve them once per block.
                    F_Area knownArea = f_Areas.FirstOrDefault(o => o.Name == area);
                    string areaId = knownArea != null ? knownArea.Id : "";
                    string typeName = GetEventTypeName(area);
                    string eventType = typeName == null ? null : resolveEventType(typeName);

                    foreach (var matchItem in matchHtml)
                    {
                        if (string.IsNullOrEmpty(matchItem.InnerHtml))
                        {
                            continue;
                        }

                        F_Events events = new F_Events
                        {
                            Id = Guid.NewGuid().ToString(),
                            Name = matchItem.InnerText.Trim(),
                            AreaId = areaId,
                            LogoImage = ReadFirstAttribute(LoadFragment(matchItem.InnerHtml), "//div/img", "src"),
                            Remark = ReadAttribute(matchItem, "href")
                        };
                        if (eventType != null)
                        {
                            events.EventType = eventType;
                        }

                        services.AddEvents(FCSLottery.F_Events, events);
                    }
                }
            }
            catch (Exception ex)
            {
                LogCrawlError(ex);
            }
        }

        /// <summary>
        /// Stores areas, their countries and the competitions of each country (the "mbcon mbl fl"
        /// blocks of the page). Entries under the "杯赛赛事" area are stored directly as events.
        /// Any exception is logged and ends the pass.
        /// </summary>
        /// <param name="mainUrl">Address of the index page to download.</param>
        private void GetCOntryMatch(string mainUrl)
        {
            try
            {
                var htmlResource = NetHelper.GetUrlResponse(mainUrl, Encoding.GetEncoding("utf-8"));
                if (htmlResource == null)
                {
                    return;
                }

                HtmlDocument doc = LoadFragment(htmlResource);

                // Event-type enum items; a missing item yields null instead of an index-out-of-range failure.
                var enumData = services.GetDataItem(DataItemDetailEnum.FootBallMatchType);
                Func<string, string> resolveEventType = typeName =>
                {
                    var ids = enumData.Where(o => o.Name == typeName).Select(o => o.Id).ToList();
                    return ids.Count > 0 ? ids[0].ToString() : null;
                };

                HtmlNodeCollection areaHtml = doc.DocumentNode.SelectNodes("//*[@class='mbcon mbl fl']");
                if (areaHtml == null)
                {
                    return;
                }

                foreach (var areaItem in areaHtml)
                {
                    if (string.IsNullOrEmpty(areaItem.InnerText))
                    {
                        continue;
                    }

                    HtmlDocument areaDoc = LoadFragment(areaItem.InnerHtml);

                    HtmlNodeCollection spans = areaDoc.DocumentNode.SelectNodes("//span");
                    if (spans == null || spans.Count == 0)
                    {
                        continue;
                    }

                    // Store the area and remember it for the continental pass.
                    F_Area areaData = new F_Area
                    {
                        Id = Guid.NewGuid().ToString(),
                        Name = spans[0].InnerText.Trim()
                    };
                    services.AddArea(FCSLottery.F_Area, areaData);
                    f_Areas.Add(areaData);

                    HtmlNodeCollection countryHtml = areaDoc.DocumentNode.SelectNodes("//*[@class='ls']/div");
                    if (countryHtml == null)
                    {
                        continue;
                    }

                    bool isCupArea = areaData.Name == "杯赛赛事";
                    string areaTypeName = GetEventTypeName(areaData.Name);
                    string areaEventType = areaTypeName == null ? null : resolveEventType(areaTypeName);

                    foreach (var countryItem in countryHtml)
                    {
                        // Check before parsing: the fragment is only loaded when there is markup to load.
                        if (string.IsNullOrEmpty(countryItem.InnerHtml))
                        {
                            continue;
                        }

                        HtmlDocument countryDoc = LoadFragment(countryItem.InnerHtml);

                        HtmlNodeCollection divs = countryDoc.DocumentNode.SelectNodes("//div");
                        if (divs == null || divs.Count == 0)
                        {
                            continue;
                        }

                        string country = divs[0].InnerText.Trim();

                        if (isCupArea)
                        {
                            // In the cup area each entry is itself a competition, not a country.
                            F_Events cupEvent = new F_Events
                            {
                                Id = Guid.NewGuid().ToString(),
                                Name = country,
                                AreaId = areaData.Id,
                                Remark = ReadFirstAttribute(countryDoc, "//a", "href"),
                                LogoImage = ReadFirstAttribute(countryDoc, "//a/img", "src")
                            };
                            string cupEventType = resolveEventType("国际赛事");
                            if (cupEventType != null)
                            {
                                cupEvent.EventType = cupEventType;
                            }

                            services.AddEvents(FCSLottery.F_Events, cupEvent);
                            continue;
                        }

                        F_Area countryData = new F_Area
                        {
                            Id = Guid.NewGuid().ToString(),
                            Name = country,
                            ParentId = areaData.Id
                        };
                        services.AddArea(FCSLottery.F_Area, countryData);

                        HtmlNodeCollection matchHtml = countryDoc.DocumentNode.SelectNodes("//*[@class='kuang']/a");
                        if (matchHtml == null)
                        {
                            continue;
                        }

                        foreach (var matchItem in matchHtml)
                        {
                            if (string.IsNullOrEmpty(matchItem.InnerHtml))
                            {
                                continue;
                            }

                            F_Events events = new F_Events
                            {
                                Id = Guid.NewGuid().ToString(),
                                Name = matchItem.InnerText.Trim(),
                                AreaId = countryData.Id,
                                Remark = ReadAttribute(matchItem, "href"),
                                LogoImage = ReadFirstAttribute(LoadFragment(matchItem.InnerHtml), "//div/img", "src")
                            };
                            if (areaEventType != null)
                            {
                                events.EventType = areaEventType;
                            }

                            services.AddEvents(FCSLottery.F_Events, events);
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                LogCrawlError(ex);
            }
        }

        /// <summary>
        /// Builds the list of index page addresses: the fixed first page plus
        /// <c>config.MainUrl</c> formatted with each further page number, without duplicates.
        /// </summary>
        /// <param name="config">Crawler configuration supplying <c>MainUrl</c> and <c>MainUrlPages</c>.</param>
        /// <returns>The distinct page addresses in page order.</returns>
        private List<string> GetMainUrl(FCSConfig config)
        {
            List<string> urlList = new List<string>();
            string url = config.MainUrl;
            int pages = config.MainUrlPages > 0 ? config.MainUrlPages : 1;

            for (int i = 1; i <= pages; i++)
            {
                string res = i == 1 ? "http://saishi.zgzcw.com/soccer" : string.Format(url, i);
                if (!urlList.Contains(res))
                {
                    urlList.Add(res);
                }
            }

            return urlList;
        }

        /// <summary>
        /// Removes HTML markup and control characters from <paramref name="html"/>, cuts the text
        /// at the "本文来源" marker and replaces "网易体育" with "彩吧足球".
        /// </summary>
        /// <param name="html">Markup to clean; null or empty input is returned unchanged.</param>
        /// <returns>The cleaned text.</returns>
        public static string NoHTML(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return html;
            }

            // NOTE(review): regex1, regex4 and regex5 have empty patterns in this copy of the file,
            // so those passes change nothing, and regex2/regex3/regex6 look truncated as well.
            // The patterns are kept exactly as found - restore them from version control.
            Regex regex1 = new Regex(@"", RegexOptions.IgnoreCase);
            Regex regex2 = new Regex(@" href *= *[sS]*script *:", RegexOptions.IgnoreCase);
            Regex regex3 = new Regex(@" no[sS]*=", RegexOptions.IgnoreCase);
            Regex regex4 = new Regex(@"", RegexOptions.IgnoreCase);
            Regex regex5 = new Regex(@"", RegexOptions.IgnoreCase);
            Regex regex6 = new Regex(@"]+>", RegexOptions.IgnoreCase);
            Regex regex9 = new Regex(@"<[^>]*>", RegexOptions.IgnoreCase);

            html = regex1.Replace(html, "");
            html = regex2.Replace(html, "");
            html = regex3.Replace(html, " _disibledevent=");
            html = regex4.Replace(html, "");
            html = regex5.Replace(html, "");
            html = regex6.Replace(html, "");
            html = regex9.Replace(html, ""); // strips every remaining tag
            html = Regex.Replace(html, "[\f\n\r\t\v]", ""); // strips line breaks, tabs and form feeds

            // Drop the source / editor credit. The character just before the marker is dropped too,
            // as before; the length is clamped so a marker at position 0 no longer throws.
            int index = html.IndexOf("本文来源", StringComparison.Ordinal);
            if (index != -1)
            {
                html = html.Substring(0, Math.Max(index - 1, 0));
            }

            // Strings are immutable: the result of Replace has to be assigned to take effect.
            html = html.Replace("网易体育", "彩吧足球");
            return html;
        }

        /// <summary>
        /// Parses <paramref name="html"/> into a stand-alone document.
        /// </summary>
        private static HtmlDocument LoadFragment(string html)
        {
            HtmlDocument fragment = new HtmlDocument();
            fragment.LoadHtml(html);
            return fragment;
        }

        /// <summary>
        /// Returns the value of the named attribute of <paramref name="node"/>, or "" when the node
        /// or the attribute is missing.
        /// </summary>
        private static string ReadAttribute(HtmlNode node, string attributeName)
        {
            if (node == null)
            {
                return "";
            }

            HtmlAttribute attribute = node.Attributes.FirstOrDefault(a => a.Name.Equals(attributeName));
            return attribute == null ? "" : attribute.Value;
        }

        /// <summary>
        /// Returns the named attribute of the first node matching <paramref name="xpath"/>, or ""
        /// when nothing matches.
        /// </summary>
        private static string ReadFirstAttribute(HtmlDocument document, string xpath, string attributeName)
        {
            HtmlNodeCollection nodes = document.DocumentNode.SelectNodes(xpath);
            if (nodes == null || nodes.Count == 0)
            {
                return "";
            }

            return ReadAttribute(nodes[0], attributeName);
        }

        /// <summary>
        /// Returns the event-type enum item name for an area heading, or null when the heading has none.
        /// </summary>
        private static string GetEventTypeName(string areaName)
        {
            string typeName;
            if (areaName != null && EventTypeNameByArea.TryGetValue(areaName, out typeName))
            {
                return typeName;
            }

            return null;
        }

        /// <summary>
        /// Writes a crawl failure to the log.
        /// </summary>
        private void LogCrawlError(Exception ex)
        {
            log.Error(GetType(), string.Format("【{0}】通过主抓取篮球分析时发生错误,错误信息【{1}】", Config.Area + currentNews, ex.Message));
        }

        #region Initialization

        /// <summary>
        /// News type appended to the area in log messages.
        /// NOTE(review): the member name reads "basketball analysis" although this job crawls football - confirm.
        /// </summary>
        private NewsTypeEnum currentNews => NewsTypeEnum.篮球分析;

        #endregion Initialization

        #region SQL

        /// <summary>
        /// Format template that looks up a category row by item name ({0} = table name, {1} = item name).
        /// NOTE(review): values are spliced into the SQL text by string formatting; never pass untrusted input.
        /// </summary>
        private static string GetLotterySqlByTableName = @"SELECT TOP 1 [ID],[ItemId],[ItemName] FROM [dbo].[{0}] where [ItemName]='{1}' "; // WHERE [IsChecked] = 1 AND [IsPassed] = 1

        #endregion
    }
}