Monday, December 16, 2013

TimesOfIndiaRipper

Here's the complete code to rip the Times of India site. You will need HtmlAgilityPack.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Xml.Linq;
using System.Text;
using HtmlAgilityPack;
namespace NewsRipper
{
    class Program
    {
        static void Main(string[] args)
        {
            List<categoryFeed> cFeeds = new List<categoryFeed>();
            cFeeds.Add(new categoryFeed { category = "Headlines", source = "http://timesofindia.feedsportal.com/c/33039/f/533965/index.rss" });
            cFeeds.Add(new categoryFeed { category = "World", source = "http://timesofindia.feedsportal.com/c/33039/f/533917/index.rss" });
            cFeeds.Add(new categoryFeed { category = "Business", source = "http://timesofindia.feedsportal.com/c/33039/f/533919/index.rss" });
            cFeeds.Add(new categoryFeed { category = "Sports", source = "http://timesofindia.feedsportal.com/c/33039/f/533921/index.rss" });
            cFeeds.Add(new categoryFeed { category = "Health", source = "http://timesofindia.feedsportal.com/c/33039/f/533968/index.rss" });
            cFeeds.Add(new categoryFeed { category = "Tech", source = "http://timesofindia.feedsportal.com/c/33039/f/533923/index.rss" });
            cFeeds.Add(new categoryFeed { category = "Entertainment", source = "http://timesofindia.feedsportal.com/c/33039/f/533928/index.rss" });
            List<Feed> items = new List<Feed>();
            foreach (categoryFeed cf in cFeeds)
            {
                items.AddRange(readFeed(cf.category,cf.source));
            }
            items.ForEach(x=>parseUrl(ref x,x.Uri));
            XElement root=new XElement("Feeds");
            foreach (XElement elm in getXElementFeed(items))root.Add(elm);
            System.IO.File.WriteAllText("c:\\wow.xml",root.ToString());
        }
          static IEnumerable<XElement> getXElementFeed(List<Feed> feed)
          {
              foreach(Feed f in feed)
              yield return ((new XElement("News", new XElement("Title", f.Title), 
                  new XElement("Category", f.Category),
                  new XElement("Uri", f.Uri), 
                  new XElement("ImageUri", f.ImageUri),
                  new XElement("Description", f.Description),
                  new XElement("Time", f.Time.ToString().ToString()))));

          }
        static List<Feed> readFeed(String category, String uri)
        {
            XDocument doc = XDocument.Load(uri);
            return doc.Descendants("item").Select(x => new Feed { Title = x.Element("title").Value, Time = DateTime.Parse(x.Element("pubDate").Value), Uri = x.Element("guid").Value,Category=category }).ToList<Feed>();
        }
        static void parseUrl(ref Feed f,String source)
        {
             HtmlDocument doc = new HtmlWeb().Load(source);
             var d = doc.DocumentNode.SelectSingleNode(@"//div[@id='artext1']");
             f.Description = (d==null)?"":d.InnerText;
            var attr=doc.DocumentNode.SelectSingleNode(@"//div[@class='mainimg1']//img");
             f.ImageUri=(attr==null)?"":attr.Attributes["src"].Value;
        }
    }
    class Feed
    {
        public String Title="", ImageUri="", Description="",Uri="",Category="";
        public DateTime Time=DateTime.Now;
    }
    class categoryFeed
    {
        public String source, category;
    }
}