Parsing HTML tables into System.Data.DataTable
What follows is a quick and dirty class I made to parse HTML tables into DataTables. As usual, it is the result of internet search/run/bug fix/refactor.
In use, it looks a little like this:
// Download the page. WebClient is IDisposable, so release it as soon as the HTML is in hand.
string html;
using (WebClient client = new WebClient())
{
    html = client.DownloadString(@"http://www.table.co.uk");
}

// ParseDataSet returns one DataTable per HTML table found in the page.
DataSet dataSet = HtmlTableParser.ParseDataSet(html);
Here is the implementation. It’s not optimised for runtime performance, but it works.
/// <summary>
/// HtmlTableParser parses the contents of an HTML string into a System.Data DataSet or DataTable.
/// </summary>
/// <remarks>
/// The parser is regular-expression based. Nested tables are not supported: the non-greedy
/// table pattern ends a table at the first closing table tag it meets.
/// </remarks>
public class HtmlTableParser
{
    private const RegexOptions ExpressionOptions =
        RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnoreCase;

    private const string CommentPattern = "<!--(.*?)-->";

    // The \b after each tag name stops a pattern from matching a longer tag that merely
    // starts with the same letters (for example "th" would otherwise match "thead").
    private const string TablePattern = @"<table\b[^>]*>(.*?)</table>";
    private const string HeaderPattern = @"<th\b[^>]*>(.*?)</th>";
    private const string RowPattern = @"<tr\b[^>]*>(.*?)</tr>";
    private const string CellPattern = @"<td\b[^>]*>(.*?)</td>";

    // Matches just the opening of a header cell; used to detect whether header cells exist.
    private const string HeaderTagPattern = @"<th\b";

    // Prefix of the names given to columns which have no header cell.
    private const string GeneratedColumnPrefix = "Column ";

    /// <summary>
    /// Given an HTML string containing n tables, parse them into a DataSet containing n DataTables.
    /// </summary>
    /// <param name="html">An HTML string containing n HTML tables</param>
    /// <returns>A DataSet containing a DataTable for each HTML table in the input HTML</returns>
    public static DataSet ParseDataSet(string html)
    {
        DataSet dataSet = new DataSet();
        MatchCollection tableMatches = Regex.Matches(
            WithoutComments(html),
            TablePattern,
            ExpressionOptions);

        foreach (Match tableMatch in tableMatches)
        {
            dataSet.Tables.Add(ParseTable(tableMatch.Value));
        }

        return dataSet;
    }

    /// <summary>
    /// Given an HTML string containing a single table, parse that table to form a DataTable.
    /// </summary>
    /// <param name="tableHtml">An HTML string containing a single HTML table</param>
    /// <returns>A DataTable which matches the input HTML table</returns>
    public static DataTable ParseTable(string tableHtml)
    {
        string tableHtmlWithoutComments = WithoutComments(tableHtml);
        DataTable dataTable = new DataTable();
        MatchCollection rowMatches = Regex.Matches(
            tableHtmlWithoutComments,
            RowPattern,
            ExpressionOptions);

        // Columns are read from the comment-free HTML so that header cells inside a
        // commented-out table never become columns of the live table.
        dataTable.Columns.AddRange(ContainsHeaderCell(tableHtmlWithoutComments)
            ? ParseColumns(tableHtmlWithoutComments)
            : GenerateColumns(rowMatches));

        ParseRows(rowMatches, dataTable);
        return dataTable;
    }

    /// <summary>
    /// Strip comments from an HTML string.
    /// </summary>
    /// <param name="html">An HTML string potentially containing comments</param>
    /// <returns>The input HTML string with comments removed</returns>
    private static string WithoutComments(string html)
    {
        return Regex.Replace(html, CommentPattern, string.Empty, ExpressionOptions);
    }

    /// <summary>
    /// Report whether the input HTML contains at least one header cell opening tag.
    /// The check is case-insensitive, in line with the other patterns used by the parser.
    /// </summary>
    /// <param name="html">The HTML fragment to inspect</param>
    /// <returns>True if a header cell tag is present, otherwise false</returns>
    private static bool ContainsHeaderCell(string html)
    {
        return Regex.IsMatch(html, HeaderTagPattern, ExpressionOptions);
    }

    /// <summary>
    /// Add a row to the input DataTable for each row match in the input MatchCollection.
    /// Rows containing header cells are skipped. If a row has more cells than the table has
    /// columns, extra generated columns are appended so that no cell is lost.
    /// </summary>
    /// <param name="rowMatches">A collection of all the rows to add to the DataTable</param>
    /// <param name="dataTable">The DataTable to which we add rows</param>
    private static void ParseRows(MatchCollection rowMatches, DataTable dataTable)
    {
        foreach (Match rowMatch in rowMatches)
        {
            // If the row contains header tags don't use it - it is a header, not a row.
            if (ContainsHeaderCell(rowMatch.Value))
            {
                continue;
            }

            MatchCollection cellMatches = Regex.Matches(
                rowMatch.Value,
                CellPattern,
                ExpressionOptions);

            // Grow the table before creating the row so the new row has a slot for every cell.
            EnsureColumnCount(dataTable, cellMatches.Count);

            DataRow dataRow = dataTable.NewRow();
            for (int columnIndex = 0; columnIndex < cellMatches.Count; columnIndex++)
            {
                dataRow[columnIndex] = cellMatches[columnIndex].Groups[1].ToString();
            }

            dataTable.Rows.Add(dataRow);
        }
    }

    /// <summary>
    /// Append generated columns to the input DataTable until it has at least the required number.
    /// </summary>
    /// <param name="dataTable">The DataTable to extend</param>
    /// <param name="requiredCount">The minimum number of columns the table must have</param>
    private static void EnsureColumnCount(DataTable dataTable, int requiredCount)
    {
        int suffix = dataTable.Columns.Count;
        while (dataTable.Columns.Count < requiredCount)
        {
            string columnName = GeneratedColumnPrefix + Convert.ToString(suffix++);

            // Skip any generated name which is already taken (for example by a header cell).
            if (!dataTable.Columns.Contains(columnName))
            {
                dataTable.Columns.Add(columnName);
            }
        }
    }

    /// <summary>
    /// Given a string containing an HTML table, parse the header cells to create a set of DataColumns
    /// which define the columns in a DataTable.
    /// </summary>
    /// <param name="tableHtml">An HTML string containing a single HTML table</param>
    /// <returns>A set of DataColumns based on the HTML table header cells</returns>
    private static DataColumn[] ParseColumns(string tableHtml)
    {
        MatchCollection headerMatches = Regex.Matches(
            tableHtml,
            HeaderPattern,
            ExpressionOptions);

        return (from Match headerMatch in headerMatches
                select new DataColumn(headerMatch.Groups[1].ToString())).ToArray();
    }

    /// <summary>
    /// For tables which do not specify header cells we must generate DataColumns based on the number
    /// of cells in the first row. A table without any rows yields no columns.
    /// </summary>
    /// <param name="rowMatches">A collection of all the rows in the HTML table we wish to generate columns for</param>
    /// <returns>A set of DataColumns based on the number of cells in the first row of the input HTML table</returns>
    private static DataColumn[] GenerateColumns(MatchCollection rowMatches)
    {
        // An empty table has no first row to measure.
        if (rowMatches.Count == 0)
        {
            return new DataColumn[0];
        }

        int columnCount = Regex.Matches(
            rowMatches[0].ToString(),
            CellPattern,
            ExpressionOptions).Count;

        return (from index in Enumerable.Range(0, columnCount)
                select new DataColumn(GeneratedColumnPrefix + Convert.ToString(index))).ToArray();
    }
}
As always, here are the tests. They yield 100% coverage but I still need to add some asserts on the column names.
/// <summary>
/// Tests for the HtmlTableParser class
/// </summary>
[TestClass]
public class ParserTest
{
    /// <summary>
    /// Verify that HtmlTableParser can parse an HTML file containing a single table. The
    /// test file includes a commented out table which should be ignored. Note some tags use
    /// attributes (we test we can parse tags with and without attributes).
    /// </summary>
    [TestMethod]
    [DeploymentItem(@"data\singleTable.txt")]
    public void TestParseSingleTable()
    {
        string html = File.ReadAllText("singleTable.txt");

        DataTable table = HtmlTableParser.ParseTable(html);

        AssertTable(GetExpectedData(), table);
    }

    /// <summary>
    /// Verify that HtmlTableParser can parse an HTML file containing multiple tables. The
    /// test file includes a commented out table which should be ignored. The test file
    /// contains tables both with and without headers.
    /// </summary>
    [TestMethod]
    [DeploymentItem(@"data\multipleTables.txt")]
    public void TestParseMultipleTables()
    {
        string html = File.ReadAllText("multipleTables.txt");

        DataSet dataSet = HtmlTableParser.ParseDataSet(html);

        Assert.AreEqual(3, dataSet.Tables.Count);
        string[][] expected = GetExpectedData();
        foreach (DataTable table in dataSet.Tables)
        {
            AssertTable(expected, table);
        }
    }

    /// <summary>
    /// The cell text every live table in the test files is expected to contain, row by row.
    /// </summary>
    /// <returns>The expected cell values, indexed by row then by column</returns>
    private static string[][] GetExpectedData()
    {
        return new[]
        {
            new[] { "row 1, cell 1", "row 1, cell 2" },
            new[] { "row 2, cell 1", "row 2, cell 2" }
        };
    }

    /// <summary>
    /// Assert that the input DataTable has the expected number of rows and that each cell,
    /// once trimmed, equals the expected text.
    /// </summary>
    /// <param name="expected">The expected cell values, indexed by row then by column</param>
    /// <param name="table">The parsed DataTable to verify</param>
    private static void AssertTable(string[][] expected, DataTable table)
    {
        Assert.AreEqual(expected.Length, table.Rows.Count, "Table did not contain the expected number of rows");

        for (int i = 0; i < expected.Length; i++)
        {
            for (int j = 0; j < expected[i].Length; j++)
            {
                // Check the cell type explicitly so an empty (DBNull) cell is reported as a
                // failed assertion rather than as a NullReferenceException.
                object cell = table.Rows[i][j];
                Assert.IsInstanceOfType(cell, typeof(string), "Table cell did not contain a string value");

                string actualElement = ((string)cell).Trim();
                string expectedElement = expected[i][j];
                Assert.AreEqual<string>(expectedElement, actualElement, "Table did not contain the expected element");
            }
        }
    }
}
These are the test files, which are just some basic HTML.
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
<meta name="robots" content="all" />
<title>Title</title>
</head>
<body>
<!-- Test fixture: one live table (a header row plus two data rows) preceded by a commented out table. Presumably deployed as singleTable.txt; confirm against the test project. -->
<!-- This table is commented out, so it shouldn't be parsed.
<table border="1">
<tr border="1">
<th>Commented Heading 1</th>
<th>Commented Heading 2</th>
</tr>
<tr border="1">
<td>Commented row 1, cell 1</td>
<td>Commented row 1, cell 2</td>
</tr>
<tr border="1">
<td>Commented row 2, cell 1</td>
<td>Commented row 2, cell 2</td>
</tr>
</table>
-->
<!-- The parser should ignore the border attributes -->
<table border="1">
<tr border="1">
<th>Heading 1</th>
<th>Heading 2</th>
</tr>
<tr>
<td border="1">row 1, cell 1</td>
<td>row 1, cell 2</td>
</tr>
<tr border="1">
<td>row 2, cell 1</td>
<td>row 2, cell 2</td>
</tr>
</table>
</body>
</html> |
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
<meta name="robots" content="all" />
<title>Title</title>
</head>
<body>
<!-- Test fixture: three live tables (without headers, with headers, without headers) holding the same two data rows, plus one commented out table. Presumably deployed as multipleTables.txt; confirm against the test project. -->
<!-- The parser should ignore the border attributes -->
<table border="1">
<tr border="1">
<td>row 1, cell 1</td>
<td>row 1, cell 2</td>
</tr>
<tr border="1">
<td>row 2, cell 1</td>
<td>row 2, cell 2</td>
</tr>
</table>
<!-- This table is commented out, so it shouldn't be parsed.
<table border="1">
<tr border="1">
<th>Commented Heading 1</th>
<th>Commented Heading 2</th>
</tr>
<tr border="1">
<td>Commented row 1, cell 1</td>
<td>Commented row 1, cell 2</td>
</tr>
<tr border="1">
<td>Commented row 2, cell 1</td>
<td>Commented row 2, cell 2</td>
</tr>
</table>
-->
<table border="1">
<tr border="1">
<th>Heading 1</th>
<th>Heading 2</th>
</tr>
<tr border="1">
<td>row 1, cell 1</td>
<td>row 1, cell 2</td>
</tr>
<tr border="1">
<td>row 2, cell 1</td>
<td>row 2, cell 2</td>
</tr>
</table>
<table>
<tr>
<td>row 1, cell 1</td>
<td>row 1, cell 2</td>
</tr>
<tr border="1">
<td>row 2, cell 1</td>
<td border="1">row 2, cell 2</td>
</tr>
</table>
</body>
</html> |
It fails on nested tables. Try to grab this:
http://www.shoutcastunlimited.com:8230