using FCS.Common;
using FCS.Crawler.Tools;
using FCS.Interface;
using FCS.Models;
using HtmlAgilityPack;
using Newtonsoft.Json;
using Quartz;
using System;
using System.Collections.Generic;
using System.Data;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
namespace FCS.Crawler.ZCLotteryNews
{
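/// <summary>
/// Quartz job that crawls Bundesliga (德甲) football news from NetEase Sports
/// and stores each parsed article through the news service.
/// </summary>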
public class DJ_FootBallNewsJob : CommonJob, IJob
{
/// <summary>
/// Creates the job and initializes the <c>log</c> and <c>services</c> members.
/// </summary>
public DJ_FootBallNewsJob() {
log = new LogHelper();
// NOTE(review): no type argument is visible on this Resolve call — confirm which
// service type is expected here (it must provide the AddNews method used by GetAll).
services = IOC.Resolve();
}
/// <summary>
/// Job entry point: reads the job configuration from the job data map and runs the crawl.
/// </summary>
/// <param name="context">Execution context whose job data map holds the configuration.</param>
public void Execute(IJobExecutionContext context)
{
    var dataMap = context.JobDetail.JobDataMap;
    Config = CommonHelper.GetConfigFromDataMap(dataMap);

    GetAll();
}
/// <summary>
/// Crawls every list page and stores each article that was parsed from it.
/// </summary>
public void GetAll()
{
    // List<T> requires its type argument; the list pages are plain URL strings.
    var listPages = new List<string> { "http://sports.163.com/dj/" };
    foreach (string listPage in listPages)
    {
        // Fetch and parse the articles linked from this list page.
        var articles = GetOpenListFromMainUrl(listPage);
        foreach (var article in articles)
        {
            services.AddNews(currentNews, article);
        }
    }
}
/// <summary>
/// Downloads a NetEase news list page, collects the distinct article links on it
/// and parses each linked article.
/// </summary>
/// <param name="mainUrl">Absolute URL of the list page.</param>
/// <returns>
/// The articles whose headline could be extracted. The list is empty when the page
/// cannot be downloaded or contains no matching links, and may be empty or partial
/// when an error occurs (the error is logged, not rethrown).
/// </returns>
private List<Base_News> GetOpenListFromMainUrl(string mainUrl)
{
    var result = new List<Base_News>();
    try
    {
        // Also validates mainUrl: a malformed URL throws here and is logged below.
        var baseUri = new Uri(mainUrl);
        var htmlResource = NetHelper.GetUrlResponse(mainUrl, Encoding.GetEncoding("gbk"));
        if (htmlResource == null) return result;

        var doc = new HtmlDocument();
        doc.LoadHtml(htmlResource);

        // Headline anchors of the news items on the list page.
        HtmlNodeCollection anchors = doc.DocumentNode.SelectNodes("//*[@class='news_item']/h3/a");
        if (anchors == null) return result;

        // The set gives cheap duplicate detection; the list keeps the page order.
        var seen = new HashSet<string>(StringComparer.Ordinal);
        var articleUrls = new List<string>();
        foreach (HtmlNode anchor in anchors)
        {
            // The name indexer returns the first match (or null), so an anchor with a
            // repeated href attribute no longer aborts the whole page.
            HtmlAttribute hrefAttribute = anchor.Attributes["href"];
            if (hrefAttribute == null || string.IsNullOrWhiteSpace(hrefAttribute.Value)) continue;

            // Resolve relative and protocol-relative links against the list page and
            // skip anything that is not fetchable over HTTP(S), e.g. "javascript:" links.
            Uri resolved;
            if (!Uri.TryCreate(baseUri, hrefAttribute.Value.Trim(), out resolved)) continue;
            if (resolved.Scheme != Uri.UriSchemeHttp && resolved.Scheme != Uri.UriSchemeHttps) continue;

            if (seen.Add(resolved.AbsoluteUri))
            {
                articleUrls.Add(resolved.AbsoluteUri);
            }
        }

        foreach (string articleUrl in articleUrls)
        {
            var article = GetNewsModel(articleUrl);
            // GetNewsModel hands back a model without a headline when parsing fails.
            if (!string.IsNullOrEmpty(article.FullHead))
            {
                result.Add(article);
            }
        }
    }
    catch (Exception ex)
    {
        log.Error(GetType(),
            string.Format("【{0}】通过主抓取德甲新闻时发生错误,错误信息【{1}】", Config.Area + currentNews, ex.Message));
    }
    return result;
}
/// <summary>
/// Downloads a single article page and maps it onto a <c>Base_News</c> model.
/// </summary>
/// <param name="url">URL of the article page.</param>
/// <returns>
/// The populated model, or a freshly constructed model with no headline assigned when
/// the page cannot be downloaded, a required element (container, headline, body, time
/// line) is missing, the body is empty, or an error occurs (logged, not rethrown).
/// </returns>
private Base_News GetNewsModel(string url)
{
    try
    {
        var htmlResource = NetHelper.GetUrlResponse(url, Encoding.GetEncoding("gb2312"));
        if (htmlResource == null) return new Base_News();

        var doc = new HtmlDocument();
        doc.LoadHtml(htmlResource);

        var container = doc.DocumentNode.SelectSingleNode("//*[@class='post_content_main']");
        if (container == null) return new Base_News();

        // The headline is the first <h1> directly under the container.
        var titleNode = container.ChildNodes.FirstOrDefault(node => node.Name == "h1");
        var bodyNode = doc.DocumentNode.SelectSingleNode("//*[@class='post_text']");
        var timeNode = doc.DocumentNode.SelectSingleNode("//*[@class='post_time_source']");

        // Every one of these is required; bail out explicitly instead of relying on a
        // NullReferenceException / ArgumentOutOfRangeException being caught below.
        if (titleNode == null || bodyNode == null || timeNode == null || timeNode.FirstChild == null)
        {
            return new Base_News();
        }

        string bodyHtml = bodyNode.InnerHtml.Trim();
        if (bodyHtml.Length == 0) return new Base_News();

        var news = new Base_News();
        news.ReleaseTime = timeNode.FirstChild.InnerText.Replace("来源:", "").Replace("网易体育", "").Replace("\n", "");

        // First centered image inside the article body, if it carries a src attribute.
        var img = bodyNode.SelectSingleNode(".//*[@class='f_center']/img");
        if (img != null)
        {
            HtmlAttribute srcAttribute = img.Attributes["src"];
            if (srcAttribute != null)
            {
                news.SourceAddress = srcAttribute.Value;
            }
        }

        news.Id = Guid.NewGuid().ToString();
        news.FullHead = titleNode.InnerText.Trim();
        news.AuthorName = "zc55128";
        news.NewsContent = NoHTML(bodyHtml);
        news.SourceName = "网易体育 德甲";
        news.TypeId = (int)NewsTypeEnum.德甲;

        // Both format arguments are code-defined values (a literal table name and the
        // enum description), so no external input reaches the SQL text.
        string category = currentNews.GetEnumDescription();
        var sql = string.Format(GetLotterySqlByTableName, "Base_DataItemDetail", category);
        var res = SqlHelper.ExecuteDataset(CommandType.Text, sql);
        if (res != null && res.Tables.Count > 0 && res.Tables[0].Rows.Count > 0)
        {
            news.CategoryId = res.Tables[0].Rows[0]["Id"].ToString();
        }
        news.Category = category;
        news.CreateDate = DateTime.Now;
        return news;
    }
    catch (Exception ex)
    {
        log.Error(GetType(),
            string.Format("【{0}】通过主抓取德甲新闻时发生错误,错误信息【{1}】", Config.Area + currentNews, ex.Message));
        // Never hand back a half-populated model: the caller keeps any model that has a
        // headline, so a failure after the headline was set must not leak through.
        return new Base_News();
    }
}
/// <summary>
/// Builds the list-page URLs to crawl from the job configuration.
/// </summary>
/// <param name="config">
/// Configuration providing the page URL template (<c>MainUrl</c>, formatted with the page
/// number) and the number of pages (<c>MainUrlPages</c>).
/// </param>
/// <returns>
/// Distinct list-page URLs in page order. Page 1 is always the fixed landing page; further
/// pages are added only when a URL template is configured.
/// </returns>
private List<string> GetMainUrl(FCSConfig config)
{
    const string firstPage = "http://sports.163.com/dj/";
    var urlList = new List<string> { firstPage };

    // Without a template only the landing page can be built; string.Format would throw
    // on a null template and produce an empty URL for an empty one.
    string template = config.MainUrl;
    if (string.IsNullOrEmpty(template)) return urlList;

    int pages = config.MainUrlPages > 0 ? config.MainUrlPages : 1;
    for (int page = 2; page <= pages; page++)
    {
        string pageUrl = string.Format(template, page);
        if (!urlList.Contains(pageUrl))
        {
            urlList.Add(pageUrl);
        }
    }
    return urlList;
}
/// <summary>
/// Strips HTML markup from an article body, cuts the text at the "本文来源" marker and
/// replaces the source site's name.
/// </summary>
/// <param name="html">HTML fragment to clean.</param>
/// <returns>The cleaned text.</returns>
public static string NoHTML(string html) // Strip HTML markup
{
// NOTE(review): the declarations of regex1..regex9 and the regex1 replacement are not
// intact in this copy of the file — the text after "new Regex(@" appears to have been
// lost. Restore them from version control; the patterns cannot be inferred from what is
// visible here. The lines up to regex9 are left byte-for-byte untouched for that reason.
// Their trailing comments read: remove href=javascript: attributes, neutralize on...
// event handlers, remove iframe, remove frameset (twice).
Regex regex1 =
new Regex(@"标记
html = regex2.Replace(html, ""); //过滤href=javascript: () 属性
html = regex3.Replace(html, " _disibledevent="); //过滤其它控件的on...事件
html = regex4.Replace(html, ""); //过滤iframe
html = regex5.Replace(html, ""); //过滤frameset
html = regex6.Replace(html, ""); //过滤frameset
html = regex9.Replace(html, "");
html = Regex.Replace(html, "[\f\n\r\t\v]", ""); // Remove form feeds, line breaks, tabs and vertical tabs
int index = html.IndexOf("本文来源");// Drop the trailing source / editor credit
if (index != -1)
{
// NOTE(review): index - 1 also drops the character right before the marker and throws
// ArgumentOutOfRangeException when the marker is at position 0 — confirm the intent.
html = html.Substring(0, index - 1);
}
// Replace mentions of the source site's name.
html = html.Replace("网易体育", "彩吧足球");
return html;
}
#region Initialization
/// <summary>
/// News type handled by this job (Bundesliga).
/// </summary>
private NewsTypeEnum currentNews
{
    get { return NewsTypeEnum.德甲; }
}
#endregion Initialization
#region SQL statements
/// <summary>
/// Query template that looks up the news-category row by item name.
/// {0} is the table name and {1} the category item name. Both are inserted with
/// string.Format, so only trusted, code-defined values may be passed.
/// </summary>
private static readonly string GetLotterySqlByTableName = @"SELECT TOP 1 [ID],[ItemId],[ItemName] FROM [dbo].[{0}] where ItemCode='NewsCategory' and [ItemName]='{1}' ";//WHERE [IsChecked] = 1 AND [IsPassed] = 1
#endregion
}
}