123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367 |
- using System;
- using System.Collections.Generic;
- using FCS.Common;
- using FCS.Crawler.Tools;
- using FCS.Interface;
- using FCS.Models;
- using HtmlAgilityPack;
- using Newtonsoft.Json;
- using Quartz;
- using System.Data;
- using System.Linq;
- using System.Text;
- using System.Text.RegularExpressions;
- namespace FCS.Crawler.ZCLotteryMatchs
- {
/// <summary>
/// Quartz job that crawls the football competition index page and stores the regions,
/// countries and competitions found there through the resolved <c>IDTOpenCode</c> service.
/// </summary>
public class FootballMatchJob : CommonJob, IJob
{
    // Regions inserted by GetCountryMatch; GetAreaMatch resolves region ids by name from this list.
    // NOTE(review): the list is static and never cleared, so entries accumulate across job runs and
    // it is shared by every instance without synchronization - confirm this is intended.
    private static readonly List<F_Area> f_Areas = new List<F_Area>();

    // Patterns used by NoHTML, compiled once instead of on every call.
    // "[\s\S]+?" is deliberate: the previous "[sS]+" only matched the letters s/S, so script,
    // iframe and frameset bodies were never removed; the lazy quantifier keeps the text that
    // sits between two separate blocks.
    private static readonly Regex ScriptBlockRegex =
        new Regex(@"<script[\s\S]+?</script *>", RegexOptions.IgnoreCase);
    // Limited to a single tag ([^>]) so it cannot swallow text between unrelated tags.
    private static readonly Regex ScriptHrefRegex =
        new Regex(@" href *= *[^>]*?script *:", RegexOptions.IgnoreCase);
    // Inline event handlers (onclick=, onload=, ...); the previous pattern looked for " no" instead of " on".
    private static readonly Regex EventAttributeRegex =
        new Regex(@" on\w+ *=", RegexOptions.IgnoreCase);
    private static readonly Regex IframeBlockRegex =
        new Regex(@"<iframe[\s\S]+?</iframe *>", RegexOptions.IgnoreCase);
    private static readonly Regex FramesetBlockRegex =
        new Regex(@"<frameset[\s\S]+?</frameset *>", RegexOptions.IgnoreCase);
    private static readonly Regex ImageTagRegex =
        new Regex(@"<img[^>]+>", RegexOptions.IgnoreCase);
    private static readonly Regex AnyTagRegex =
        new Regex(@"<[^>]*>", RegexOptions.IgnoreCase);
    private static readonly Regex ControlWhitespaceRegex =
        new Regex("[\f\n\r\t\v]");

    /// <summary>
    /// Creates the job with a fresh log helper and the <c>IDTOpenCode</c> service resolved from the IOC container.
    /// </summary>
    public FootballMatchJob()
    {
        log = new LogHelper();
        services = IOC.Resolve<IDTOpenCode>();
    }

    /// <summary>
    /// Quartz entry point: reads the job configuration from the job data map, then runs the crawl.
    /// </summary>
    /// <param name="context">Execution context supplied by the scheduler.</param>
    public void Execute(IJobExecutionContext context)
    {
        Config = CommonHelper.GetConfigFromDataMap(context.JobDetail.JobDataMap);
        GetAll();
    }

    /// <summary>
    /// Crawls the competition index page.
    /// </summary>
    public void GetAll()
    {
        var urls = new List<string> { "http://saishi.zgzcw.com/soccer/" };
        foreach (string url in urls)
        {
            // Country-level competitions first: this pass fills f_Areas, which the
            // region-level pass below uses to resolve region ids.
            GetCountryMatch(url);
            // Region-level competitions listed directly under a region.
            GetAreaMatch(url);
        }
    }

    /// <summary>
    /// Stores the competitions that are listed directly under a region block
    /// (nodes with class "mbcon mbr fl"), skipping the block that contains "世界排名".
    /// Errors are logged instead of propagated, matching <see cref="GetCountryMatch"/>.
    /// </summary>
    /// <param name="mainUrl">Address of the index page to download.</param>
    private void GetAreaMatch(string mainUrl)
    {
        try
        {
            HtmlDocument page = LoadDocument(mainUrl);
            if (page == null)
            {
                return;
            }

            // Data-dictionary entries used to translate a region name into an event type id.
            var enumData = services.GetDataItem(DataItemDetailEnum.FootBallMatchType);

            HtmlNodeCollection areaNodes = page.DocumentNode.SelectNodes("//*[@class='mbcon mbr fl']");
            if (areaNodes == null)
            {
                return;
            }

            foreach (HtmlNode areaNode in areaNodes)
            {
                if (string.IsNullOrEmpty(areaNode.InnerText) || areaNode.InnerText.Contains("世界排名"))
                {
                    continue;
                }

                HtmlDocument areaDoc = ParseFragment(areaNode.InnerHtml);

                // SelectSingleNode returns null when nothing matches, so a block without a
                // title is skipped instead of crashing the job.
                HtmlNode titleNode = areaDoc.DocumentNode.SelectSingleNode("//span");
                if (titleNode == null)
                {
                    continue;
                }
                string area = titleNode.InnerText.Trim();

                HtmlNodeCollection matchNodes = areaDoc.DocumentNode.SelectNodes("//div/a");
                if (matchNodes == null)
                {
                    continue;
                }

                // Region id and event type are the same for every competition of this block.
                F_Area knownArea = f_Areas.FirstOrDefault(o => o.Name == area);
                var areaId = knownArea != null ? knownArea.Id : "";

                string eventTypeName = MapAreaToEventTypeName(area);
                // First() throws when the data dictionary lacks the entry, as the previous "[0]" lookup did.
                string eventType = eventTypeName != null
                    ? enumData.Where(o => o.Name == eventTypeName).Select(o => o.Id).First().ToString()
                    : null;

                foreach (HtmlNode matchNode in matchNodes)
                {
                    F_Events events = new F_Events();
                    events.Id = Guid.NewGuid().ToString();
                    events.Name = matchNode.InnerText.Trim();
                    events.AreaId = areaId;
                    events.LogoImage = ReadLogoUrl(matchNode);
                    if (eventType != null)
                    {
                        events.EventType = eventType;
                    }
                    // The link target is kept in Remark; empty when the anchor has no href.
                    events.Remark = matchNode.GetAttributeValue("href", "");
                    services.AddEvents(FCSLottery.F_Events, events);
                }
            }
        }
        catch (Exception ex)
        {
            LogCrawlError(ex);
        }
    }

    /// <summary>
    /// Stores every region (nodes with class "mbcon mbl fl"), the countries below it and the
    /// competitions of each country. Entries of the "杯赛赛事" region are stored directly as
    /// events of type "国际赛事" instead of as countries. Any exception is logged and ends the crawl.
    /// </summary>
    /// <param name="mainUrl">Address of the index page to download.</param>
    private void GetCountryMatch(string mainUrl)
    {
        try
        {
            HtmlDocument page = LoadDocument(mainUrl);
            if (page == null)
            {
                return;
            }

            // Data-dictionary entries used to translate a region name into an event type id.
            var enumData = services.GetDataItem(DataItemDetailEnum.FootBallMatchType);

            HtmlNodeCollection areaNodes = page.DocumentNode.SelectNodes("//*[@class='mbcon mbl fl']");
            if (areaNodes == null)
            {
                return;
            }

            foreach (HtmlNode areaNode in areaNodes)
            {
                if (string.IsNullOrEmpty(areaNode.InnerText))
                {
                    continue;
                }

                HtmlDocument areaDoc = ParseFragment(areaNode.InnerHtml);
                HtmlNode titleNode = areaDoc.DocumentNode.SelectSingleNode("//span");
                if (titleNode == null)
                {
                    continue;
                }

                // Store the region and remember it for GetAreaMatch.
                F_Area areaData = new F_Area();
                areaData.Id = Guid.NewGuid().ToString();
                areaData.Name = titleNode.InnerText.Trim();
                services.AddArea(FCSLottery.F_Area, areaData);
                f_Areas.Add(areaData);

                HtmlNodeCollection countryNodes = areaDoc.DocumentNode.SelectNodes("//*[@class='ls']/div");
                if (countryNodes == null)
                {
                    continue;
                }

                foreach (HtmlNode countryNode in countryNodes)
                {
                    // Check before parsing: an empty entry has nothing to read.
                    if (string.IsNullOrEmpty(countryNode.InnerHtml))
                    {
                        continue;
                    }

                    HtmlDocument countryDoc = ParseFragment(countryNode.InnerHtml);
                    HtmlNode nameNode = countryDoc.DocumentNode.SelectSingleNode("//div");
                    if (nameNode == null)
                    {
                        continue;
                    }
                    string country = nameNode.InnerText.Trim();

                    if (areaData.Name == "杯赛赛事")
                    {
                        // Cup entries are competitions themselves, not countries.
                        HtmlNode link = countryDoc.DocumentNode.SelectSingleNode("//a");
                        HtmlNode logo = countryDoc.DocumentNode.SelectSingleNode("//a/img");

                        F_Events cupEvent = new F_Events();
                        cupEvent.Id = Guid.NewGuid().ToString();
                        cupEvent.Name = country;
                        cupEvent.AreaId = areaData.Id;
                        cupEvent.EventType = enumData.Where(o => o.Name == "国际赛事").Select(o => o.Id).First().ToString();
                        cupEvent.Remark = link != null ? link.GetAttributeValue("href", "") : "";
                        cupEvent.LogoImage = logo != null ? logo.GetAttributeValue("src", "") : "";
                        services.AddEvents(FCSLottery.F_Events, cupEvent);
                        continue;
                    }

                    // Regular region: store the country below the region, then its competitions.
                    F_Area countryData = new F_Area();
                    countryData.Id = Guid.NewGuid().ToString();
                    countryData.Name = country;
                    countryData.ParentId = areaData.Id;
                    services.AddArea(FCSLottery.F_Area, countryData);

                    HtmlNodeCollection matchNodes = countryDoc.DocumentNode.SelectNodes("//*[@class='kuang']/a");
                    if (matchNodes == null)
                    {
                        continue;
                    }

                    // The event type depends only on the region, so resolve it once per country.
                    string eventTypeName = MapAreaToEventTypeName(areaData.Name);
                    string eventType = eventTypeName != null
                        ? enumData.Where(o => o.Name == eventTypeName).Select(o => o.Id).First().ToString()
                        : null;

                    foreach (HtmlNode matchNode in matchNodes)
                    {
                        if (string.IsNullOrEmpty(matchNode.InnerHtml))
                        {
                            continue;
                        }

                        F_Events events = new F_Events();
                        events.Id = Guid.NewGuid().ToString();
                        events.Name = matchNode.InnerText.Trim();
                        events.AreaId = countryData.Id;
                        // Missing href/src yield "" (same fallback the cup branch uses) instead of
                        // a NullReferenceException that would abort the whole crawl.
                        events.Remark = matchNode.GetAttributeValue("href", "");
                        events.LogoImage = ReadLogoUrl(matchNode);
                        if (eventType != null)
                        {
                            events.EventType = eventType;
                        }
                        services.AddEvents(FCSLottery.F_Events, events);
                    }
                }
            }
        }
        catch (Exception ex)
        {
            LogCrawlError(ex);
        }
    }

    /// <summary>
    /// Downloads a page as UTF-8 and parses it.
    /// </summary>
    /// <param name="pageUrl">Address to download.</param>
    /// <returns>The parsed document, or null when the download returned null.</returns>
    private static HtmlDocument LoadDocument(string pageUrl)
    {
        var html = NetHelper.GetUrlResponse(pageUrl, Encoding.UTF8);
        if (html == null)
        {
            return null;
        }
        return ParseFragment(html);
    }

    /// <summary>
    /// Parses an HTML fragment into its own document so that absolute XPath queries
    /// ("//...") only see that fragment rather than the whole page.
    /// </summary>
    private static HtmlDocument ParseFragment(string html)
    {
        HtmlDocument fragment = new HtmlDocument();
        fragment.LoadHtml(html);
        return fragment;
    }

    /// <summary>
    /// Reads the logo address from the first "div/img" inside a competition link.
    /// </summary>
    /// <returns>The value of the src attribute, or "" when the image or the attribute is missing.</returns>
    private static string ReadLogoUrl(HtmlNode linkNode)
    {
        HtmlNode image = ParseFragment(linkNode.InnerHtml).DocumentNode.SelectSingleNode("//div/img");
        return image != null ? image.GetAttributeValue("src", "") : "";
    }

    /// <summary>
    /// Maps a region title from the page to the name of the matching data-dictionary entry.
    /// "杯赛赛事" is intentionally absent: cup entries are stored with type "国际赛事" by GetCountryMatch.
    /// </summary>
    /// <returns>The entry name, or null when the region has no event type.</returns>
    private static string MapAreaToEventTypeName(string areaName)
    {
        switch (areaName)
        {
            case "欧洲赛事":
            case "亚洲赛事":
            case "美洲赛事":
            case "非洲赛事":
                return areaName;
            case "澳洲赛事":
                // The page says "澳洲", the data dictionary says "大洋洲".
                return "大洋洲赛事";
            default:
                return null;
        }
    }

    /// <summary>
    /// Writes a crawl failure to the job log.
    /// </summary>
    private void LogCrawlError(Exception ex)
    {
        // NOTE(review): the message text and currentNews both say "篮球分析" (basketball analysis)
        // although this job crawls football competitions - looks copied from another job; confirm
        // the intended wording/enum value before changing it.
        log.Error(GetType(),
            string.Format("【{0}】通过主抓取篮球分析时发生错误,错误信息【{1}】", Config.Area + currentNews, ex.Message));
    }

    /// <summary>
    /// Builds the list of index page addresses from the configuration.
    /// Page 1 is always the fixed index address; later pages format the page number into
    /// <c>config.MainUrl</c>. Duplicate addresses are added only once.
    /// </summary>
    /// <param name="config">Configuration providing <c>MainUrl</c> and <c>MainUrlPages</c>.</param>
    /// <returns>The distinct page addresses in page order.</returns>
    private List<string> GetMainUrl(FCSConfig config)
    {
        var urlList = new List<string>();
        string template = config.MainUrl;
        // A non-positive page count still crawls the first page.
        int pageCount = config.MainUrlPages > 0 ? config.MainUrlPages : 1;

        for (int page = 1; page <= pageCount; page++)
        {
            string pageUrl = page == 1
                ? "http://saishi.zgzcw.com/soccer"
                : string.Format(template, page);

            if (!urlList.Contains(pageUrl))
            {
                urlList.Add(pageUrl);
            }
        }

        return urlList;
    }

    /// <summary>
    /// Reduces an HTML fragment to plain text: removes script, iframe and frameset blocks,
    /// strips every remaining tag and the control whitespace characters, cuts the text at the
    /// "本文来源" marker and replaces "网易体育" with "彩吧足球".
    /// </summary>
    /// <param name="html">Markup to clean; null or empty yields an empty string.</param>
    /// <returns>The cleaned text.</returns>
    public static string NoHTML(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return string.Empty;
        }

        html = ScriptBlockRegex.Replace(html, "");                      // <script>...</script> blocks
        html = ScriptHrefRegex.Replace(html, "");                       // href=javascript: attributes
        html = EventAttributeRegex.Replace(html, " _disibledevent=");   // inline on... event handlers
        html = IframeBlockRegex.Replace(html, "");                      // <iframe>...</iframe> blocks
        html = FramesetBlockRegex.Replace(html, "");                    // <frameset>...</frameset> blocks
        html = ImageTagRegex.Replace(html, "");                         // <img> tags
        html = AnyTagRegex.Replace(html, "");                           // every remaining tag
        html = ControlWhitespaceRegex.Replace(html, "");                // form feed, line breaks, tabs

        // Drop the source/editor credit. The character right before the marker is dropped too,
        // as before; Math.Max keeps a marker at position 0 from producing a negative length.
        int index = html.IndexOf("本文来源");
        if (index != -1)
        {
            html = html.Substring(0, Math.Max(index - 1, 0));
        }

        // Strings are immutable: the result has to be assigned, otherwise the replacement is lost.
        html = html.Replace("网易体育", "彩吧足球");
        return html;
    }

    #region Initialization
    /// <summary>
    /// News type that is appended to the area name in log messages.
    /// </summary>
    private NewsTypeEnum currentNews => NewsTypeEnum.篮球分析;
    #endregion Initialization

    #region SQL statements
    /// <summary>
    /// Query template that selects the id of a category: {0} is the table name, {1} the item name.
    /// </summary>
    // NOTE(review): not referenced inside this class. Both placeholders are spliced in with
    // string formatting, so callers must pass trusted values only or move to a parameterized query.
    // A filter "WHERE [IsChecked] = 1 AND [IsPassed] = 1" was previously disabled here.
    private static readonly string GetLotterySqlByTableName = @"SELECT TOP 1 [ID],[ItemId],[ItemName] FROM [dbo].[{0}] where [ItemName]='{1}' ";
    #endregion
}
- }
|