Then I looked at some code that uses regular expressions and it looked inefficient. If I wanted to scrape multiple tags then I would need lots of passes with the regular expressions. And what if I was parsing malformed HTML? The code I looked at wouldn't handle that. If I'm looking at scraping with lots of recursion, this code could be executed millions of times.
Here's the rough draft of my parser method, which takes a block of HTML as a string and builds a list of links. The next version will scrape links out of content too, not just tags. Anything that looks like a link and can be verified.
Thoughts? I'm inclined to think my code would execute faster than plucking the tags I want out using regex. It seems to work well.
Code: Select all
/// <summary>
/// Scans <paramref name="page"/> one character at a time, driving a small state
/// machine that extracts links from &lt;a href="..."&gt; and &lt;img src="..."&gt; tags.
/// Relative links are absolutized against <paramref name="scrapeUrl"/> before
/// being reported via NewWebLink. Malformed HTML is tolerated: a '&lt;' always
/// resets the machine, and unparseable fragments are simply skipped.
/// </summary>
/// <param name="scrapeUrl">URL the page was fetched from; used to resolve relative links.</param>
/// <param name="page">Raw HTML text to scan. Null or empty is a no-op.</param>
public void ParsePage(string scrapeUrl, string page)
{
    if (string.IsNullOrEmpty(page))
    {
        return; // Nothing to parse.
    }

    string curTagName = "";
    string curAttrName = "";
    string curAttrVal = "";
    // Null until we have seen a '<'; a stray '"' before any tag must not touch it.
    // Case-insensitive keys so HREF/Href/href all resolve on lookup.
    Dictionary<string, string> attributes = null;

    foreach (char c in page)
    {
        switch (c)
        {
            case '<':
                // Start of a tag: reset state regardless of where we were,
                // which is what makes the parser tolerant of malformed HTML.
                NewState(WebScraperStates.InTag);
                attributes = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);
                curTagName = "";
                break;

            case '>':
                NewState(WebScraperStates.InContent);
                string tagName = curTagName.ToLowerInvariant();
                if ((tagName == "a" || tagName == "img") && attributes != null && attributes.Count > 0)
                {
                    // href for anchors, src for images; TryGetValue (rather than the
                    // indexer) means a missing attribute skips the tag instead of
                    // throwing and relying on a swallowed exception for control flow.
                    string attrKey = tagName == "a" ? "href" : "src";
                    if (attributes.TryGetValue(attrKey, out string link) && link.Length > 0)
                    {
                        if (link.StartsWith("//", StringComparison.Ordinal))
                        {
                            // Protocol-relative link: assume http.
                            link = "http:" + link;
                        }
                        else if (link.StartsWith("/", StringComparison.Ordinal))
                        {
                            // Root-relative link: prefix the scrape URL.
                            link = scrapeUrl + link;
                        }
                        if (!link.StartsWith("http://", StringComparison.OrdinalIgnoreCase) &&
                            !link.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
                        {
                            // Still not absolute: document-relative, merge with the base URL.
                            link = MergeUrl(scrapeUrl, link);
                        }
                        NewWebLink(link, tagName, "", "", "");
                    }
                }
                // TODO: meta tags (refresh redirects, base href) are recognized
                // by the state machine but not yet handled.
                break;

            case '=':
                if (ScrapeState == WebScraperStates.InTagAttr)
                {
                    NewState(WebScraperStates.InAttrValStart);
                }
                break;

            case '\"':
                switch (ScrapeState)
                {
                    case WebScraperStates.InAttrValStart:
                        // Opening quote: start collecting the attribute value.
                        NewState(WebScraperStates.InAttrValContent);
                        curAttrVal = "";
                        break;
                    case WebScraperStates.InAttrValContent:
                        // Closing quote: record the attribute. The indexer (not Add)
                        // tolerates duplicate attribute names in malformed HTML,
                        // which previously threw and aborted the whole page; the
                        // null guard covers a stray quote seen before any '<'.
                        NewState(WebScraperStates.InTag);
                        if (attributes != null)
                        {
                            attributes[curAttrName] = curAttrVal;
                        }
                        break;
                }
                break;

            case ' ':
                if (ScrapeState == WebScraperStates.InTag)
                {
                    // Tag name is complete; subsequent tokens are attributes.
                    NewState(WebScraperStates.InTagAttr);
                    curAttrName = "";
                }
                break;

            default:
                switch (ScrapeState)
                {
                    // Accumulate the character into whichever token is open.
                    case WebScraperStates.InTag:
                        curTagName += c;
                        break;
                    case WebScraperStates.InTagAttr:
                        curAttrName += c;
                        break;
                    case WebScraperStates.InAttrValContent:
                        curAttrVal += c;
                        break;
                }
                break;
        }
    }
}
Code: Select all
/// <summary>
/// Extracts anchor elements and their href/text from an HTML string using
/// regular expressions.
/// </summary>
static class LinkFinder
{
    // Compile each pattern once. The original re-parsed the regex on every call
    // (and two of them on every match), which dominates cost when this runs over
    // many pages.
    private static readonly Regex AnchorPattern = new Regex(
        @"(<a.*?>.*?</a>)", RegexOptions.Singleline | RegexOptions.Compiled);
    private static readonly Regex HrefPattern = new Regex(
        @"href=\""(.*?)\""", RegexOptions.Singleline | RegexOptions.Compiled);
    private static readonly Regex InnerTagPattern = new Regex(
        @"\s*<.*?>\s*", RegexOptions.Singleline | RegexOptions.Compiled);

    /// <summary>
    /// Finds every &lt;a&gt;...&lt;/a&gt; element in <paramref name="file"/> and
    /// returns one LinkItem per match: Href is taken from the tag's href attribute
    /// (only double-quoted values are recognized; Href is left at its default when
    /// absent), and Text is the anchor's content with all inner tags stripped.
    /// </summary>
    /// <param name="file">HTML text to scan; may be empty.</param>
    /// <returns>Links in document order; empty list when no anchors are found.</returns>
    public static List<LinkItem> Find(string file)
    {
        List<LinkItem> list = new List<LinkItem>();
        foreach (Match m in AnchorPattern.Matches(file))
        {
            string value = m.Groups[1].Value;
            LinkItem i = new LinkItem();

            // href attribute, if present.
            Match href = HrefPattern.Match(value);
            if (href.Success)
            {
                i.Href = href.Groups[1].Value;
            }

            // Visible text: strip nested tags (including the <a> wrapper itself).
            i.Text = InnerTagPattern.Replace(value, "");
            list.Add(i);
        }
        return list;
    }
}