using FCS.Common;
using FCS.Crawler.Tools;
using FCS.Interface;
using FCS.Models;
using HtmlAgilityPack;
using Newtonsoft.Json;
using Quartz;
using System;
using System.Collections.Generic;
using System.Data;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;

namespace FCS.Crawler.ZCLotteryNews
{
    /// <summary>
    /// Quartz job that crawls the NetEase Sports listing page for 亚冠 (AFC Champions League)
    /// news, turns every linked article into a <c>Base_News</c> record and hands it to the
    /// news service.
    /// </summary>
    public class YG_FootBallNewsJob : CommonJob, IJob
    {
        /// <summary>
        /// Listing page that is crawled; also used as page 1 by <see cref="GetMainUrl"/>.
        /// </summary>
        private const string FirstPageUrl = "http://sports.163.com/acl/";

        // The filters below are applied by NoHTML in declaration order. They are built once
        // instead of on every call; the patterns themselves are unchanged.

        // NOTE(review): this pattern is empty, so the replacement is a no-op. The original
        // trailing comment only said "filter tags" - confirm which markup it was meant to strip.
        private static readonly Regex MarkupFilter = new Regex(@"", RegexOptions.IgnoreCase);

        // NOTE(review): "[sS]*" matches only runs of the letter s, so " href=javascript:" is
        // not matched by this pattern - confirm whether that is intended.
        private static readonly Regex ScriptHrefFilter = new Regex(@" href *= *[sS]*script *:", RegexOptions.IgnoreCase);

        // NOTE(review): matches " no" followed by any number of s characters and "="; the
        // original comment described it as a filter for on... event attributes - confirm.
        private static readonly Regex EventAttributeFilter = new Regex(@" no[sS]*=", RegexOptions.IgnoreCase);

        // NOTE(review): empty pattern (no-op); the original comment said it filters iframe.
        private static readonly Regex IframeFilter = new Regex(@"", RegexOptions.IgnoreCase);

        // NOTE(review): empty pattern (no-op); the original comment said it filters frameset.
        private static readonly Regex FramesetFilter = new Regex(@"", RegexOptions.IgnoreCase);

        // Removes one or more ']' immediately followed by '>'.
        // NOTE(review): the pattern looks truncated - confirm the intended expression.
        private static readonly Regex BracketFilter = new Regex(@"]+>", RegexOptions.IgnoreCase);

        // Removes every remaining <...> tag.
        private static readonly Regex AnyTagFilter = new Regex(@"<[^>]*>", RegexOptions.IgnoreCase);

        // Removes form feeds, line feeds, carriage returns, tabs and vertical tabs.
        private static readonly Regex ControlWhitespaceFilter = new Regex("[\f\n\r\t\v]");

        /// <summary>
        /// Creates the job with its logger and news service.
        /// </summary>
        public YG_FootBallNewsJob()
        {
            log = new LogHelper();
            // NOTE(review): Resolve is called without a type argument here; confirm the
            // service type that is expected to be resolved.
            services = IOC.Resolve();
        }

        /// <summary>
        /// Quartz entry point: reads the job configuration from the job data map and runs the crawl.
        /// </summary>
        /// <param name="context">Execution context supplied by the scheduler.</param>
        public void Execute(IJobExecutionContext context)
        {
            Config = CommonHelper.GetConfigFromDataMap(context.JobDetail.JobDataMap);
            GetAll();
        }

        /// <summary>
        /// Crawls every configured listing page and stores each article that was parsed.
        /// </summary>
        public void GetAll()
        {
            List<string> urls = new List<string> { FirstPageUrl };
            foreach (string url in urls)
            {
                // Fetch the articles linked from this listing page.
                List<Base_News> newsItems = GetOpenListFromMainUrl(url);
                foreach (var newsItem in newsItems)
                {
                    services.AddNews(currentNews, newsItem);
                }
            }
        }

        /// <summary>
        /// Downloads a NetEase listing page and parses every distinct article it links to.
        /// </summary>
        /// <param name="mainUrl">Address of the listing page.</param>
        /// <returns>
        /// The articles that yielded a non-empty headline; empty when the page cannot be
        /// fetched, has no matching links, or an error occurs (the error is logged).
        /// </returns>
        private List<Base_News> GetOpenListFromMainUrl(string mainUrl)
        {
            var result = new List<Base_News>();
            try
            {
                // Validation only: throws for a null or malformed address, which the catch
                // below logs before the empty list is returned.
                new Uri(mainUrl);

                var htmlResource = NetHelper.GetUrlResponse(mainUrl, Encoding.GetEncoding("gbk"));
                if (htmlResource == null)
                {
                    return result;
                }

                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(htmlResource);

                // Anchor tags inside the headline of each news item.
                HtmlNodeCollection nodeList = doc.DocumentNode.SelectNodes("//*[@class='news_item']/h3/a");
                if (nodeList == null)
                {
                    return result;
                }

                // Collect each href once, keeping the order in which links appear on the page.
                var seen = new HashSet<string>();
                var articleUrls = new List<string>();
                foreach (HtmlNode node in nodeList)
                {
                    // FirstOrDefault: a duplicated href attribute must not abort the whole listing.
                    HtmlAttribute attr = node.Attributes.FirstOrDefault(a => a.Name.Equals("href"));
                    if (attr != null && seen.Add(attr.Value))
                    {
                        articleUrls.Add(attr.Value);
                    }
                }

                // Parse each article; a record without a headline is treated as a failed parse.
                foreach (var articleUrl in articleUrls)
                {
                    var news = GetNewsModel(articleUrl);
                    if (!string.IsNullOrEmpty(news.FullHead))
                    {
                        result.Add(news);
                    }
                }
            }
            catch (Exception ex)
            {
                log.Error(GetType(), string.Format("【{0}】通过主抓取亚冠新闻时发生错误,错误信息【{1}】", Config.Area + currentNews, ex.Message));
            }
            return result;
        }

        /// <summary>
        /// Downloads one article page and extracts headline, release time, first image and body.
        /// </summary>
        /// <param name="url">Address of the article page.</param>
        /// <returns>
        /// The populated record. When the page cannot be fetched or lacks the expected
        /// container, headline, body or time node, the record is returned with
        /// <c>FullHead</c> unset so that the caller skips it.
        /// </returns>
        private Base_News GetNewsModel(string url)
        {
            Base_News YCNew = new Base_News();
            try
            {
                var htmlResource = NetHelper.GetUrlResponse(url, Encoding.GetEncoding("gb2312"));
                if (htmlResource == null)
                {
                    return YCNew;
                }

                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(htmlResource);

                var mainDiv = doc.DocumentNode.SelectSingleNode("//*[@class='post_content_main']");
                if (mainDiv == null)
                {
                    return YCNew;
                }

                // The headline is the first <h1> directly under the main container.
                var titleNode = mainDiv.ChildNodes.FirstOrDefault(node => node.Name == "h1");
                if (titleNode == null)
                {
                    return YCNew;
                }

                var contentDiv = doc.DocumentNode.SelectSingleNode("//*[@class='post_text']");
                if (contentDiv == null)
                {
                    return YCNew;
                }

                string newsContent = contentDiv.InnerHtml.Trim();
                if (newsContent == "")
                {
                    // Nothing to publish; skip instead of feeding an empty body to the parsers below.
                    return YCNew;
                }

                var timeDiv = doc.DocumentNode.SelectSingleNode("//*[@class='post_time_source']");
                if (timeDiv == null || timeDiv.FirstChild == null)
                {
                    return YCNew;
                }

                // Keep only the text of the first child, minus the source label and line breaks.
                YCNew.ReleaseTime = timeDiv.FirstChild.InnerText.Replace("来源:", "").Replace("网易体育", "").Replace("\n", "");

                // The first centred image of the body, when there is one, becomes the source address.
                HtmlDocument imgdoc = new HtmlDocument();
                imgdoc.LoadHtml(newsContent);
                var img = imgdoc.DocumentNode.SelectSingleNode("//*[@class='f_center']/img");
                if (img != null)
                {
                    var imgsrc = img.Attributes.FirstOrDefault(a => a.Name.Equals("src"));
                    if (imgsrc != null)
                    {
                        YCNew.SourceAddress = imgsrc.Value;
                    }
                }

                YCNew.Id = Guid.NewGuid().ToString();
                YCNew.FullHead = titleNode.InnerText.Trim();
                YCNew.AuthorName = "zc55128";
                YCNew.NewsContent = NoHTML(newsContent);
                YCNew.SourceName = "网易体育 亚冠";
                YCNew.TypeId = (int)NewsTypeEnum.亚冠;

                // Look up the category id by the description of the news type. The value
                // comes from the enum, not from crawled input.
                string category = currentNews.GetEnumDescription();
                var sql = string.Format(GetLotterySqlByTableName, "Base_DataItemDetail", category);
                var res = SqlHelper.ExecuteDataset(CommandType.Text, sql);
                if (res != null && res.Tables.Count > 0 && res.Tables[0].Rows.Count > 0)
                {
                    YCNew.CategoryId = res.Tables[0].Rows[0]["Id"].ToString();
                }

                YCNew.Category = category;
                YCNew.CreateDate = DateTime.Now;
            }
            catch (Exception ex)
            {
                log.Error(GetType(), string.Format("【{0}】通过主抓取亚冠新闻时发生错误,错误信息【{1}】", Config.Area + currentNews, ex.Message));
            }
            return YCNew;
        }

        /// <summary>
        /// Builds the list of listing-page addresses from the configuration. Page 1 is always
        /// the fixed listing address; later pages are produced by formatting
        /// <c>config.MainUrl</c> with the page number. Not called by any other member of this class.
        /// </summary>
        /// <param name="config">Configuration providing <c>MainUrl</c> and <c>MainUrlPages</c>.</param>
        /// <returns>Distinct page addresses in page order.</returns>
        private List<string> GetMainUrl(FCSConfig config)
        {
            List<string> urlList = new List<string>();
            string template = config.MainUrl;
            // A page count of zero or less means "first page only".
            int pages = config.MainUrlPages > 0 ? config.MainUrlPages : 1;
            for (int page = 1; page <= pages; page++)
            {
                string pageUrl = page == 1 ? FirstPageUrl : string.Format(template, page);
                if (!urlList.Contains(pageUrl))
                {
                    urlList.Add(pageUrl);
                }
            }
            return urlList;
        }

        /// <summary>
        /// Strips HTML tags and control whitespace from an article body, cuts the text at the
        /// "本文来源" footer and replaces the "网易体育" brand with "彩吧足球".
        /// </summary>
        /// <param name="html">Markup to clean.</param>
        /// <returns>The cleaned text; a null or empty input is returned unchanged.</returns>
        public static string NoHTML(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return html;
            }

            html = MarkupFilter.Replace(html, "");
            html = ScriptHrefFilter.Replace(html, "");
            html = EventAttributeFilter.Replace(html, " _disibledevent=");
            html = IframeFilter.Replace(html, "");
            html = FramesetFilter.Replace(html, "");
            html = BracketFilter.Replace(html, "");
            html = AnyTagFilter.Replace(html, "");
            html = ControlWhitespaceFilter.Replace(html, "");

            // Drop the source / editor footer. The character just before the marker is
            // dropped as well; the length is clamped so a marker at offset 0 yields an
            // empty string instead of throwing.
            int index = html.IndexOf("本文来源", StringComparison.Ordinal);
            if (index != -1)
            {
                html = html.Substring(0, Math.Max(index - 1, 0));
            }

            return html.Replace("网易体育", "彩吧足球");
        }

        #region Initialization

        /// <summary>
        /// News type handled by this job.
        /// </summary>
        private NewsTypeEnum currentNews => NewsTypeEnum.亚冠;

        #endregion Initialization

        #region SQL statements

        /// <summary>
        /// Query for the id of a news category; {0} is the table name and {1} the item name.
        /// </summary>
        private static readonly string GetLotterySqlByTableName = @"SELECT TOP 1 [ID],[ItemId],[ItemName] FROM [dbo].[{0}] where ItemCode='NewsCategory' and [ItemName]='{1}' "; // WHERE [IsChecked] = 1 AND [IsPassed] = 1

        #endregion
    }
}