123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262 |
- using FCS.Common;
- using FCS.Crawler.Tools;
- using FCS.Interface;
- using FCS.Models;
- using HtmlAgilityPack;
- using Newtonsoft.Json;
- using Quartz;
- using System;
- using System.Collections.Generic;
- using System.Data;
- using System.Linq;
- using System.Text;
- using System.Text.RegularExpressions;
- namespace FCS.Crawler.ZCLotteryNews
- {
- public class ZC_FootBallNewsJob : CommonJob, IJob
- {
/// <summary>
/// Creates the job: assigns a new <c>LogHelper</c> to <c>log</c> and the
/// <c>IDTOpenCode</c> instance resolved through <c>IOC</c> to <c>services</c>.
/// </summary>
public ZC_FootBallNewsJob()
{
    var logger = new LogHelper();
    log = logger;

    var openCodeService = IOC.Resolve<IDTOpenCode>();
    services = openCodeService;
}
/// <summary>
/// Job entry point: loads <c>Config</c> from the job's data map and then runs <see cref="GetAll"/>.
/// </summary>
/// <param name="context">Execution context whose <c>JobDetail.JobDataMap</c> carries the crawler settings.</param>
public void Execute(IJobExecutionContext context)
{
    var dataMap = context.JobDetail.JobDataMap;
    Config = CommonHelper.GetConfigFromDataMap(dataMap);

    GetAll();
}
/// <summary>
/// Crawls the listing page(s) and passes every article found to <c>services.AddNews</c>.
/// </summary>
/// <remarks>
/// The only listing page is the hard-coded <c>http://sports.163.com/zc/</c> address.
/// </remarks>
public void GetAll()
{
    var listingPages = new[] { "http://sports.163.com/zc/" };

    for (int pageIndex = 0; pageIndex < listingPages.Length; pageIndex++)
    {
        // Fetch the news list of this page, then store the items one by one.
        List<Base_News> articles = GetOpenListFromMainUrl(listingPages[pageIndex]);
        articles.ForEach(article => services.AddNews(currentNews, article));
    }
}
/// <summary>
/// Downloads a NetEase news listing page, collects the article links found in the
/// <c>&lt;h3&gt;</c> of every <c>news_item</c> element and crawls each linked article.
/// </summary>
/// <param name="mainUrl">Absolute URL of the listing page.</param>
/// <returns>
/// The crawled articles whose <c>FullHead</c> is non-empty. The list is empty when the
/// page cannot be downloaded or contains no matching links; if an exception occurs it
/// is logged and whatever was collected so far is returned.
/// </returns>
private List<Base_News> GetOpenListFromMainUrl(string mainUrl)
{
    var result = new List<Base_News>();
    try
    {
        // Parsing up front validates mainUrl (a malformed value ends up in the catch
        // below) and provides the base for resolving relative links.
        var baseUri = new Uri(mainUrl);

        var htmlResource = NetHelper.GetUrlResponse(mainUrl, Encoding.GetEncoding("gbk"));
        if (htmlResource == null)
        {
            return result;
        }

        var doc = new HtmlDocument();
        doc.LoadHtml(htmlResource);

        // Article links are the anchors inside the <h3> of each "news_item" element.
        HtmlNodeCollection anchors = doc.DocumentNode.SelectNodes("//*[@class='news_item']/h3/a");
        if (anchors == null)
        {
            return result;
        }

        // The list keeps first-seen order; the set makes de-duplication O(1) per link.
        var articleUrls = new List<string>();
        var seenUrls = new HashSet<string>();
        foreach (HtmlNode anchor in anchors)
        {
            // FirstOrDefault instead of SingleOrDefault: a duplicated href attribute
            // on one anchor must not abort the crawl of the whole page.
            HtmlAttribute hrefAttribute = anchor.Attributes.FirstOrDefault(a => a.Name.Equals("href"));
            if (hrefAttribute == null || string.IsNullOrWhiteSpace(hrefAttribute.Value))
            {
                continue;
            }

            // Resolve relative and protocol-relative ("//host/path") links against the
            // listing page so the downloader always receives an absolute address.
            Uri articleUri;
            if (!Uri.TryCreate(baseUri, hrefAttribute.Value.Trim(), out articleUri))
            {
                continue;
            }

            // Skip links that cannot be downloaded over HTTP (javascript:, mailto:, ...).
            if (articleUri.Scheme != Uri.UriSchemeHttp && articleUri.Scheme != Uri.UriSchemeHttps)
            {
                continue;
            }

            string articleUrl = articleUri.AbsoluteUri;
            if (seenUrls.Add(articleUrl))
            {
                articleUrls.Add(articleUrl);
            }
        }

        // Crawl each article; a model without a headline marks a failed or unusable page.
        foreach (var articleUrl in articleUrls)
        {
            var article = GetNewsModel(articleUrl);
            if (!string.IsNullOrEmpty(article.FullHead))
            {
                result.Add(article);
            }
        }
    }
    catch (Exception ex)
    {
        log.Error(GetType(),
            string.Format("【{0}】通过主抓取中超新闻时发生错误,错误信息【{1}】", Config.Area + currentNews, ex.Message));
    }
    return result;
}
/// <summary>
/// Downloads a single article page and maps it to a <c>Base_News</c> record.
/// </summary>
/// <param name="url">URL of the article page.</param>
/// <returns>
/// A populated record when the page has a headline (<c>h1</c> directly under
/// <c>post_content_main</c>), a non-empty <c>post_text</c> body and a
/// <c>post_time_source</c> element. In every other case, including any exception
/// (which is logged), an empty record whose <c>FullHead</c> is null is returned;
/// <c>GetOpenListFromMainUrl</c> discards records without a <c>FullHead</c>.
/// </returns>
private Base_News GetNewsModel(string url)
{
    var news = new Base_News();
    try
    {
        var htmlResource = NetHelper.GetUrlResponse(url, Encoding.GetEncoding("gb2312"));
        if (htmlResource == null)
        {
            return news;
        }

        var doc = new HtmlDocument();
        doc.LoadHtml(htmlResource);

        var mainNode = doc.DocumentNode.SelectSingleNode("//*[@class='post_content_main']");
        if (mainNode == null)
        {
            return news;
        }

        // Headline: first <h1> that is a direct child of the main container.
        var titleNode = mainNode.ChildNodes.FirstOrDefault(node => node.Name == "h1");
        if (titleNode == null)
        {
            return news;
        }

        var contentNode = doc.DocumentNode.SelectSingleNode("//*[@class='post_text']");
        if (contentNode == null)
        {
            return news;
        }

        string contentHtml = contentNode.InnerHtml.Trim();
        if (contentHtml.Length == 0)
        {
            // An article without a body is skipped explicitly rather than through an exception.
            return news;
        }

        // The first child of the time/source element holds the release time text.
        var timeNode = doc.DocumentNode.SelectSingleNode("//*[@class='post_time_source']");
        string timeText = timeNode?.FirstChild?.InnerText;
        if (timeText == null)
        {
            return news;
        }

        news.ReleaseTime = timeText.Replace("来源:", "").Replace("网易体育", "").Replace("\n", "");

        // Lead image, if any: the first <img> directly under an "f_center" element of the body.
        var imageNode = contentNode.SelectSingleNode(".//*[@class='f_center']/img");
        if (imageNode != null)
        {
            // The image may lack a src attribute; that must not discard the article.
            var srcAttribute = imageNode.Attributes.FirstOrDefault(a => a.Name.Equals("src"));
            if (srcAttribute != null)
            {
                news.SourceAddress = srcAttribute.Value;
            }
        }

        news.Id = Guid.NewGuid().ToString();
        news.FullHead = titleNode.InnerText.Trim();
        news.AuthorName = "zc55128";
        news.NewsContent = NoHTML(contentHtml);
        news.SourceName = "网易体育 中超";
        news.TypeId = (int)NewsTypeEnum.中超;

        // Look up the id of the category whose name equals the enum description.
        string categoryName = currentNews.GetEnumDescription();
        var sql = string.Format(GetLotterySqlByTableName, "Base_DataItemDetail", categoryName);
        var res = SqlHelper.ExecuteDataset(CommandType.Text, sql);
        if (res != null && res.Tables.Count > 0 && res.Tables[0].Rows.Count > 0)
        {
            news.CategoryId = res.Tables[0].Rows[0]["Id"].ToString();
        }
        news.Category = categoryName;
        news.CreateDate = DateTime.Now;
    }
    catch (Exception ex)
    {
        log.Error(GetType(),
            string.Format("【{0}】通过主抓取中超新闻时发生错误,错误信息【{1}】", Config.Area + currentNews, ex.Message));

        // Never hand back a half-populated record: the caller only checks FullHead,
        // which may already have been set when the failure happened.
        return new Base_News();
    }
    return news;
}
/// <summary>
/// Builds the list of listing-page URLs to crawl.
/// </summary>
/// <param name="config">
/// Crawler settings; <c>MainUrl</c> is used as a format string that receives the page
/// number, and <c>MainUrlPages</c> is the number of pages (values below 1 count as 1).
/// </param>
/// <returns>
/// The distinct page URLs in page order. Page 1 is always the fixed
/// <c>http://sports.163.com/zc/</c> address.
/// </returns>
private List<string> GetMainUrl(FCSConfig config)
{
    string template = config.MainUrl;
    int pageCount = config.MainUrlPages > 0 ? config.MainUrlPages : 1;

    var pageUrls = new List<string>();
    int page = 1;
    while (page <= pageCount)
    {
        // Only pages after the first are produced from the configured template.
        string pageUrl = page == 1
            ? "http://sports.163.com/zc/"
            : string.Format(template, page);

        if (pageUrls.IndexOf(pageUrl) < 0)
        {
            pageUrls.Add(pageUrl);
        }

        page++;
    }

    return pageUrls;
}
// Whole <script>, <iframe> and <frameset> elements including their bodies. The
// quantifier is lazy so that text between two such elements is not swallowed.
private static readonly Regex ScriptElementRegex =
    new Regex(@"<script[\s\S]+?</script *>", RegexOptions.IgnoreCase);

private static readonly Regex IframeElementRegex =
    new Regex(@"<iframe[\s\S]+?</iframe *>", RegexOptions.IgnoreCase);

private static readonly Regex FramesetElementRegex =
    new Regex(@"<frameset[\s\S]+?</frameset *>", RegexOptions.IgnoreCase);

// Any remaining tag (this also covers <img ...>).
private static readonly Regex AnyTagRegex = new Regex(@"<[^>]*>");

// Form feed, line feed, carriage return, tab and vertical tab.
private static readonly Regex LineControlRegex = new Regex("[\f\n\r\t\v]");

/// <summary>
/// Reduces an HTML fragment to plain text.
/// </summary>
/// <remarks>
/// Steps, in order: remove script/iframe/frameset elements together with their
/// content; strip every remaining tag; remove form feeds, line breaks and tabs; cut
/// everything from the "本文来源" marker onwards; replace "网易体育" with "彩吧足球".
/// No attribute-level filtering is done because all tags are removed entirely.
/// </remarks>
/// <param name="html">HTML fragment; may be null or empty.</param>
/// <returns>The plain text, or <paramref name="html"/> unchanged when it is null or empty.</returns>
public static string NoHTML(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return html;
    }

    html = ScriptElementRegex.Replace(html, "");
    html = IframeElementRegex.Replace(html, "");
    html = FramesetElementRegex.Replace(html, "");
    html = AnyTagRegex.Replace(html, "");
    html = LineControlRegex.Replace(html, "");

    // Drop the trailing source / editor credit.
    int index = html.IndexOf("本文来源", StringComparison.Ordinal);
    if (index != -1)
    {
        // As before, the single character in front of the marker is dropped as well;
        // Math.Max keeps a marker at position 0 from producing a negative length.
        // NOTE(review): confirm the "- 1" is intended and not an off-by-one.
        html = html.Substring(0, Math.Max(index - 1, 0));
    }

    html = html.Replace("网易体育", "彩吧足球");
    return html;
}
#region Initialization

/// <summary>
/// News category handled by this job; always <c>NewsTypeEnum.中超</c>.
/// </summary>
private NewsTypeEnum currentNews
{
    get { return NewsTypeEnum.中超; }
}
#endregion Initialization
#region SQL statements
/// <summary>
/// Query template that selects the first row of table <c>{0}</c> whose
/// <c>ItemCode</c> is 'NewsCategory' and whose <c>ItemName</c> equals <c>{1}</c>.
/// Read-only: the template is a constant and must not be reassigned at run time.
/// </summary>
// Not applied: WHERE [IsChecked] = 1 AND [IsPassed] = 1
private static readonly string GetLotterySqlByTableName = @"SELECT TOP 1 [ID],[ItemId],[ItemName] FROM [dbo].[{0}] where ItemCode='NewsCategory' and [ItemName]='{1}' ";
#endregion
- }
- }
|