Skip to content

Parsing HTML tables into System.Data.DataTable

by Alex Peck on May 6th, 2010

What follows is a quick and dirty class I made to parse HTML tables into DataTables. As usual, it is the result of internet search/run/bug fix/refactor.

In use, it looks a little like this:

// Download the page. WebClient is IDisposable, so release it as soon as the HTML is in hand.
string html;
using (WebClient client = new WebClient())
{
    html = client.DownloadString(@"http://www.table.co.uk");
}

// ParseDataSet returns one DataTable per <table> element found in the HTML.
DataSet dataSet = HtmlTableParser.ParseDataSet(html);

Here is the implementation. It’s not optimised for runtime performance, but it works.

/// <summary>
/// HtmlTableParser parses the contents of an HTML string into a System.Data DataSet or DataTable.
/// </summary>
/// <remarks>
/// Parsing is done with regular expressions rather than a real HTML parser. Each pattern is
/// non-greedy, so a table match ends at the first closing table tag; nested tables are therefore
/// not parsed correctly.
/// </remarks>
public class HtmlTableParser
{
    private const RegexOptions ExpressionOptions = RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnoreCase;

    // The \b after each tag name stops a pattern from matching a longer tag that merely starts
    // with the same letters (for example "<th" must not match "<thead>").
    private const string CommentPattern = "<!--(.*?)-->";
    private const string TablePattern = @"<table\b[^>]*>(.*?)</table>";
    private const string HeaderPattern = @"<th\b[^>]*>(.*?)</th>";
    private const string RowPattern = @"<tr\b[^>]*>(.*?)</tr>";
    private const string CellPattern = @"<td\b[^>]*>(.*?)</td>";

    // Matches the start of an opening header cell tag; used only to detect whether headers exist.
    private const string HeaderTagPattern = @"<th\b";

    /// <summary>
    /// Given an HTML string containing n tables, parse them into a DataSet containing n DataTables.
    /// </summary>
    /// <param name="html">An HTML string containing n HTML tables</param>
    /// <returns>A DataSet containing a DataTable for each HTML table in the input HTML</returns>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="html"/> is null</exception>
    public static DataSet ParseDataSet(string html)
    {
        if (html == null)
        {
            throw new ArgumentNullException("html");
        }

        DataSet dataSet = new DataSet();
        MatchCollection tableMatches = Regex.Matches(
            WithoutComments(html),
            TablePattern,
            ExpressionOptions);

        foreach (Match tableMatch in tableMatches)
        {
            dataSet.Tables.Add(ParseTable(tableMatch.Value));
        }

        return dataSet;
    }

    /// <summary>
    /// Given an HTML string containing a single table, parse that table to form a DataTable.
    /// </summary>
    /// <param name="tableHtml">An HTML string containing a single HTML table</param>
    /// <returns>A DataTable which matches the input HTML table</returns>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="tableHtml"/> is null</exception>
    public static DataTable ParseTable(string tableHtml)
    {
        if (tableHtml == null)
        {
            throw new ArgumentNullException("tableHtml");
        }

        string tableHtmlWithoutComments = WithoutComments(tableHtml);

        DataTable dataTable = new DataTable();

        MatchCollection rowMatches = Regex.Matches(
            tableHtmlWithoutComments,
            RowPattern,
            ExpressionOptions);

        // Columns must come from the comment-free HTML, otherwise header cells inside a
        // commented-out table would be added as columns.
        dataTable.Columns.AddRange(ContainsHeaderCell(tableHtmlWithoutComments)
                                       ? ParseColumns(tableHtmlWithoutComments)
                                       : GenerateColumns(rowMatches));

        ParseRows(rowMatches, dataTable);

        return dataTable;
    }

    /// <summary>
    /// Strip comments from an HTML string
    /// </summary>
    /// <param name="html">An HTML string potentially containing comments</param>
    /// <returns>The input HTML string with comments removed</returns>
    private static string WithoutComments(string html)
    {
        return Regex.Replace(html, CommentPattern, string.Empty, ExpressionOptions);
    }

    /// <summary>
    /// Test whether an HTML fragment contains at least one header cell tag, using the same
    /// case-insensitive options as the rest of the parser.
    /// </summary>
    /// <param name="html">The HTML fragment to inspect</param>
    /// <returns>True if the fragment contains an opening header cell tag</returns>
    private static bool ContainsHeaderCell(string html)
    {
        return Regex.IsMatch(html, HeaderTagPattern, ExpressionOptions);
    }

    /// <summary>
    /// Add a row to the input DataTable for each row match in the input MatchCollection
    /// </summary>
    /// <param name="rowMatches">A collection of all the rows to add to the DataTable</param>
    /// <param name="dataTable">The DataTable to which we add rows</param>
    private static void ParseRows(MatchCollection rowMatches, DataTable dataTable)
    {
        foreach (Match rowMatch in rowMatches)
        {
            // if the row contains header tags don't use it - it is a header not a row
            if (ContainsHeaderCell(rowMatch.Value))
            {
                continue;
            }

            MatchCollection cellMatches = Regex.Matches(
                rowMatch.Value,
                CellPattern,
                ExpressionOptions);

            // A row may have more cells than the header (or the first row) declared; grow the
            // table before creating the row so that indexing by column cannot go out of range.
            EnsureColumnCount(dataTable, cellMatches.Count);

            DataRow dataRow = dataTable.NewRow();

            for (int columnIndex = 0; columnIndex < cellMatches.Count; columnIndex++)
            {
                dataRow[columnIndex] = cellMatches[columnIndex].Groups[1].ToString();
            }

            dataTable.Rows.Add(dataRow);
        }
    }

    /// <summary>
    /// Append generated columns to the DataTable until it has at least the required number.
    /// </summary>
    /// <param name="dataTable">The DataTable to extend</param>
    /// <param name="requiredCount">The minimum number of columns the table must have</param>
    private static void EnsureColumnCount(DataTable dataTable, int requiredCount)
    {
        while (dataTable.Columns.Count < requiredCount)
        {
            string generatedName = GenerateColumnName(dataTable.Columns.Count);

            if (dataTable.Columns.Contains(generatedName))
            {
                // A parsed header already uses this name; let the collection pick a default name.
                dataTable.Columns.Add();
            }
            else
            {
                dataTable.Columns.Add(generatedName);
            }
        }
    }

    /// <summary>
    /// Build the name used for a generated column at the given zero-based position.
    /// </summary>
    /// <param name="index">The zero-based column position</param>
    /// <returns>The generated column name, e.g. "Column 0"</returns>
    private static string GenerateColumnName(int index)
    {
        return "Column " + Convert.ToString(index);
    }

    /// <summary>
    /// Given a string containing an HTML table, parse the header cells to create a set of DataColumns
    /// which define the columns in a DataTable.
    /// </summary>
    /// <param name="tableHtml">An HTML string containing a single HTML table</param>
    /// <returns>A set of DataColumns based on the HTML table header cells</returns>
    private static DataColumn[] ParseColumns(string tableHtml)
    {
        MatchCollection headerMatches = Regex.Matches(
            tableHtml,
            HeaderPattern,
            ExpressionOptions);

        return (from Match headerMatch in headerMatches
                select new DataColumn(headerMatch.Groups[1].ToString())).ToArray();
    }

    /// <summary>
    /// For tables which do not specify header cells we must generate DataColumns based on the number
    /// of cells in the first row. Rows with more cells than that get extra columns when they are parsed.
    /// </summary>
    /// <param name="rowMatches">A collection of all the rows in the HTML table we wish to generate columns for</param>
    /// <returns>
    /// A set of DataColumns based on the number of cells in the first row of the input HTML table,
    /// or an empty set when the table has no rows
    /// </returns>
    private static DataColumn[] GenerateColumns(MatchCollection rowMatches)
    {
        if (rowMatches.Count == 0)
        {
            return new DataColumn[0];
        }

        int columnCount = Regex.Matches(
            rowMatches[0].ToString(),
            CellPattern,
            ExpressionOptions).Count;

        return (from index in Enumerable.Range(0, columnCount)
                select new DataColumn(GenerateColumnName(index))).ToArray();
    }
}

As always, here are the tests. They yield 100% coverage but I still need to add some asserts on the column names.

/// <summary>
/// Tests for the HtmlTableParser class
/// </summary>
[TestClass]
public class ParserTest
{
    /// <summary>
    /// Verify that HtmlTableParser can parse an HTML file containing a single table. The
    /// test file includes a commented out table which should be ignored. Note some tags use
    /// attributes (we test we can parse tags with and without attributes).
    /// </summary>
    [TestMethod]
    [DeploymentItem(@"data\singleTable.txt")]
    public void TestParseSingleTable()
    {
        string html = File.ReadAllText("singleTable.txt");
        DataTable table = HtmlTableParser.ParseTable(html);

        AssertTable(GetExpectedData(), table);
    }

    /// <summary>
    /// Verify that HtmlTableParser can parse an HTML file containing multiple tables. The
    /// test file includes a commented out table which should be ignored. The test file
    /// contains tables both with and without headers.
    /// </summary>
    [TestMethod]
    [DeploymentItem(@"data\multipleTables.txt")]
    public void TestParseMultipleTables()
    {
        string html = File.ReadAllText("multipleTables.txt");
        DataSet dataSet = HtmlTableParser.ParseDataSet(html);
        Assert.AreEqual(3, dataSet.Tables.Count);

        var expected = GetExpectedData();

        // Every live table in the fixture holds the same two data rows.
        foreach (DataTable table in dataSet.Tables)
        {
            AssertTable(expected, table);
        }
    }

    /// <summary>
    /// The cell text, row by row, that every non-commented table in the test files contains.
    /// </summary>
    /// <returns>A jagged array indexed by row, then by cell</returns>
    private static string[][] GetExpectedData()
    {
        return new[]
        {
            new[] { "row 1, cell 1", "row 1, cell 2" },
            new[] { "row 2, cell 1", "row 2, cell 2" }
        };
    }

    /// <summary>
    /// Assert that the DataTable holds exactly the expected rows and that the leading cells of
    /// each row match the expected text once surrounding whitespace is trimmed.
    /// </summary>
    /// <param name="expected">The expected cell text, indexed by row, then by cell</param>
    /// <param name="table">The parsed DataTable to check</param>
    private static void AssertTable(string[][] expected, DataTable table)
    {
        Assert.AreEqual(expected.Length, table.Rows.Count, "Table did not contain the expected number of rows");

        for (int i = 0; i < expected.Length; i++)
        {
            // Check the width first so a narrow table fails the assertion rather than throwing
            // when the cell is indexed below.
            Assert.IsTrue(
                table.Columns.Count >= expected[i].Length,
                "Table did not contain the expected number of columns");

            for (int j = 0; j < expected[i].Length; j++)
            {
                // An unset cell holds DBNull, not a string; report that as a test failure
                // instead of a NullReferenceException.
                object cell = table.Rows[i][j];
                Assert.IsInstanceOfType(cell, typeof(string), "Table cell did not contain a string value");

                string actualElement = ((string)cell).Trim();
                string expectedElement = expected[i][j];

                Assert.AreEqual<string>(expectedElement, actualElement, "Table did not contain the expected element");
            }
        }
    }
}

These are the test files (singleTable.txt followed by multipleTables.txt), which are just some basic HTML.

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
      <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
    <meta name="robots" content="all" />
      <title>Title</title>
</head>
 
<body>
      <!-- Fixture for TestParseSingleTable: one commented-out table followed by one live table.
      This table is commented out, so it shouldn't be parsed.
      <table border="1">
            <tr  border="1">
                  <th>Commented Heading 1</th>
                  <th>Commented Heading 2</th>
            </tr>
            <tr  border="1">
                  <td>Commented row 1, cell 1</td>
                  <td>Commented row 1, cell 2</td>
            </tr>
            <tr  border="1">
                  <td>Commented row 2, cell 1</td>
                  <td>Commented row 2, cell 2</td>
            </tr>
      </table>
      -->
 
      <!-- Live table with a header row. The parser should ignore the border attributes:
      tags with and without attributes must be parsed alike. -->
      <table border="1">
            <tr  border="1">
                  <th>Heading 1</th>
                  <th>Heading 2</th>
            </tr>
            <tr>
                  <td border="1">row 1, cell 1</td>
                  <td>row 1, cell 2</td>
            </tr>
            <tr  border="1">
                  <td>row 2, cell 1</td>
                  <td>row 2, cell 2</td>
            </tr>
      </table> 
</body>
</html>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
      <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
    <meta name="robots" content="all" />
      <title>Title</title>
</head>
 
<body>
      <!-- Fixture for TestParseMultipleTables: three live tables plus one commented-out table.
      First live table has no header row. The parser should ignore the border attributes. -->
      <table border="1">
            <tr  border="1">
                  <td>row 1, cell 1</td>
                  <td>row 1, cell 2</td>
            </tr>
            <tr  border="1">
                  <td>row 2, cell 1</td>
                  <td>row 2, cell 2</td>
            </tr>
      </table> 
      <!-- This table is commented out, so it shouldn't be parsed (the test expects exactly three tables).
      <table border="1">
            <tr  border="1">
                  <th>Commented Heading 1</th>
                  <th>Commented Heading 2</th>
            </tr>
            <tr  border="1">
                  <td>Commented row 1, cell 1</td>
                  <td>Commented row 1, cell 2</td>
            </tr>
            <tr  border="1">
                  <td>Commented row 2, cell 1</td>
                  <td>Commented row 2, cell 2</td>
            </tr>
      </table>
      -->
      <!-- Second live table: has a header row. -->
      <table border="1">
            <tr  border="1">
                  <th>Heading 1</th>
                  <th>Heading 2</th>
            </tr>
            <tr  border="1">
                  <td>row 1, cell 1</td>
                  <td>row 1, cell 2</td>
            </tr>
            <tr  border="1">
                  <td>row 2, cell 1</td>
                  <td>row 2, cell 2</td>
            </tr>
      </table> 
      <!-- Third live table: no header row, and a mix of tags with and without attributes. -->
      <table>
            <tr>
                  <td>row 1, cell 1</td>
                  <td>row 1, cell 2</td>
            </tr>
            <tr  border="1">
                  <td>row 2, cell 1</td>
                  <td border="1">row 2, cell 2</td>
            </tr>
      </table> 
</body>
</html>
6 Comments
  1. Manic permalink

    It fails on nested tables. Try to grab this:
    http://www.shoutcastunlimited.com:8230

  2. Carlos permalink

    Hi Alex,

    I would like to know if your code is available for others to use, I have found this piece is just what I need for a project I am working on.

    Please let me know.

    Thanks.
    Carlos

  3. WizardOfWhiff permalink

    <thead> tags mess up the HeaderMatches. I’m sure there’s a nice regex way to take care of that, but I went quick and dirty and just stripped the <thead> before running the regex:

    /// <summary>
    /// Variant of ParseColumns that removes the opening thead tag before matching header cells.
    /// </summary>
    /// <param name="tableHtml">An HTML string containing a single HTML table</param>
    /// <returns>A set of DataColumns based on the HTML table header cells</returns>
    private static DataColumn[] ParseColumns(string tableHtml)
    {
        // <thead> messes with our HeaderPattern, so strip it out
        string strippedHtml = tableHtml.Replace("<thead>", "");

        MatchCollection headerMatches = Regex.Matches(
            strippedHtml,
            HeaderPattern,
            ExpressionOptions);

        return (from Match headerMatch in headerMatches
                select new DataColumn(headerMatch.Groups[1].ToString())).ToArray();
    }

  4. WizardOfWhiff permalink

    Where it looks like words are missing from my previous post, insert “thead” with angle brackets – particularly in the Replace method.

  5. Excelent!

    Your code help me a lot. Thanks.

  6. Thank you so much! it works great!

Leave a Reply

Note: XHTML is allowed. Your email address will never be published.

Subscribe to this comment feed via RSS