Crawling is something every search engine does across the web. Below is a simple web crawler: give it a page and it returns all the links found on that page. For the sake of example I used Google.com.
using System;
using System.Net;
using System.Collections.Generic;
using System.IO;
using System.Text.RegularExpressions;

public class Crawler
{
    public static void Main()
    {
        string url = "http://www.google.com";

        // Request the page and identify ourselves with a custom user agent.
        HttpWebRequest httpWebRequest = (HttpWebRequest)WebRequest.Create(url);
        httpWebRequest.UserAgent = "Anurag's Crawler";

        // Read the whole response body as HTML text.
        WebResponse webResponse = httpWebRequest.GetResponse();
        Stream stream = webResponse.GetResponseStream();
        StreamReader streamReader = new StreamReader(stream);
        string htmlText = streamReader.ReadToEnd();

        // Extract and print every link found on the page.
        var allLinks = GetNewLinks(htmlText);
        foreach (var link in allLinks)
        {
            Console.WriteLine(link);
        }
    }

    private static List<string> GetNewLinks(string content)
    {
        // Match the value of the href attribute inside anchor tags.
        Regex regexForLink = new Regex("(?<=<a\\s*?href=(?:'|\"))[^'\"]*?(?=(?:'|\"))");

        List<string> newLinks = new List<string>();
        foreach (var match in regexForLink.Matches(content))
        {
            if (!newLinks.Contains(match.ToString()))
                newLinks.Add(match.ToString());
        }
        return newLinks;
    }
}
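The code above stops after a single page. A real crawler keeps a queue of URLs still to visit and a set of URLs it has already fetched, so it never hits the same page twice. Here is a minimal sketch of such a loop, reusing the GetNewLinks method above; the Crawl method, the maxPages limit and the absolute-URL check are my own additions for illustration, not part of the original post.

// Breadth-first crawl sketch: a queue of URLs to visit and a set of URLs already seen.
private static void Crawl(string startUrl, int maxPages)
{
    var toVisit = new Queue<string>();
    var visited = new HashSet<string>();
    toVisit.Enqueue(startUrl);

    while (toVisit.Count > 0 && visited.Count < maxPages)
    {
        string url = toVisit.Dequeue();
        if (!visited.Add(url)) continue;          // skip pages we have already fetched

        string html;
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.UserAgent = "Anurag's Crawler";
            using (WebResponse response = request.GetResponse())
            using (StreamReader reader = new StreamReader(response.GetResponseStream()))
            {
                html = reader.ReadToEnd();
            }
        }
        catch (WebException) { continue; }        // ignore pages that fail to load

        Console.WriteLine(url);
        foreach (string link in GetNewLinks(html))
        {
            // Only follow absolute http(s) links; relative links would need resolving first.
            if (link.StartsWith("http") && !visited.Contains(link))
                toVisit.Enqueue(link);
        }
    }
}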
Web Crawler - this is a crawler written using Reactive Extensions.
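To give an idea of what the Reactive Extensions approach looks like, here is a minimal sketch of fetching one page and exposing its links as an observable stream. This is my own illustration assuming HttpClient and Rx.NET (System.Reactive), not code from the linked project.

using System;
using System.Linq;
using System.Net.Http;
using System.Reactive.Linq;
using System.Text.RegularExpressions;

public class RxCrawlerSketch
{
    public static void Main()
    {
        var http = new HttpClient();
        string seed = "http://www.google.com";

        // Wrap the async page fetch in an observable, then flatten the links found
        // on the page into a stream of strings that can be composed further
        // (filtered, throttled, fed back in as new requests, and so on).
        Observable.FromAsync(() => http.GetStringAsync(seed))
            .SelectMany(html => Regex.Matches(html, "href=[\"']([^\"']+)[\"']")
                                     .Cast<Match>()
                                     .Select(m => m.Groups[1].Value)
                                     .ToObservable())
            .Distinct()                        // emit each link only once
            .Subscribe(link => Console.WriteLine(link));

        Console.ReadLine();                    // keep the console alive while the fetch runs
    }
}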
One more web crawler that is available, and a bit more complex, is Archnode.net.