Learn 2 CSharp 11
11. The input is an HTML table. Remove all tags and put the data in a comma- or tab-separated file.
Part of this was an exercise in looking up what others have already done.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Data;
using System.Text.RegularExpressions;
using System.Threading;
namespace lrn2CSharp11
{
    /// <summary>
    /// Console utility that reads an HTML file, extracts every
    /// <c>&lt;table&gt;</c> element with regular expressions and writes the
    /// cell data to a comma separated file named after the input file.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Options shared by every HTML pattern: '.' spans line breaks and
        /// tag names are matched regardless of case.
        /// </summary>
        private const RegexOptions HtmlRegexOptions =
            RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnoreCase;

        /// <summary>
        /// Prompts for an HTML file name, converts its tables and writes them
        /// to "&lt;input path&gt;.csv", then waits for the Escape key.
        /// </summary>
        /// <param name="args">Command line arguments (unused).</param>
        static void Main( string[ ] args )
        {
            Console.WriteLine( "Input name of a file with an HTML table in the current directory to convert to csv." );
            Console.Write( ":" );
            string file = Console.ReadLine( );

            // NOTE(review): the published listing lost its single-token lines,
            // so the try/catch around the conversion is a reconstruction;
            // confirm against the original project.
            try
            {
                string html;
                string outputPath;

                // 'using' guarantees the handles are released even when
                // reading fails part way through.
                using ( FileStream fs = new FileStream( file, FileMode.Open, FileAccess.Read ) )
                using ( StreamReader sr = new StreamReader( fs ) )
                {
                    html = sr.ReadToEnd( );
                    outputPath = fs.Name + ".csv";
                }

                DataSet ds = ConvertHTMLTablesToDataSet( html );

                using ( StreamWriter sw = new StreamWriter( outputPath ) )
                {
                    if ( ds != null )
                    {
                        foreach ( DataTable dtc in ds.Tables )
                        {
                            WriteTableAsCsv( dtc, sw );
                        }
                    }
                }
            }
            catch ( Exception ex )
            {
                Console.WriteLine( "WTF?!?" );
                // Say what actually went wrong instead of swallowing it.
                Console.WriteLine( ex.Message );
            }

            Console.WriteLine( "Press esc to exit." );
            // ReadKey blocks until a key is available, so no sleep is needed.
            while ( !keyPressHandler( Console.ReadKey( true ) ) )
            {
                /* no op */
            }
        }

        /// <summary>
        /// Tells whether a key press should end the program.
        /// </summary>
        /// <param name="input">The key that was read from the console.</param>
        /// <returns><c>true</c> when the key is Escape; otherwise <c>false</c>.</returns>
        private static Boolean keyPressHandler( ConsoleKeyInfo input )
        {
            return input.Key == ConsoleKey.Escape;
        }

        /// <summary>
        /// Writes one table as CSV: a header line with the column names, one
        /// line per row, and a trailing blank line that separates tables.
        /// </summary>
        /// <param name="table">The table to serialise.</param>
        /// <param name="writer">Destination for the CSV text.</param>
        private static void WriteTableAsCsv( DataTable table, TextWriter writer )
        {
            int columnCount = table.Columns.Count;

            for ( int i = 0; i < columnCount; i++ )
            {
                writer.Write( EscapeCsvField( table.Columns[ i ].ColumnName ) );
                if ( i < columnCount - 1 )
                {
                    writer.Write( "," );
                }
            }
            writer.WriteLine( );

            foreach ( DataRow row in table.Rows )
            {
                for ( int i = 0; i < columnCount; i++ )
                {
                    // Missing cells stay empty but still get their separator.
                    if ( !Convert.IsDBNull( row[ i ] ) )
                    {
                        writer.Write( EscapeCsvField( row[ i ].ToString( ) ) );
                    }
                    if ( i < columnCount - 1 )
                    {
                        writer.Write( "," );
                    }
                }
                writer.WriteLine( );
            }

            writer.WriteLine( );
        }

        /// <summary>
        /// Quotes a value when it contains a comma, a double quote or a line
        /// break, doubling embedded quotes, so the field survives a round
        /// trip through a CSV parser.
        /// </summary>
        /// <param name="value">Raw field text; <c>null</c> is treated as empty.</param>
        /// <returns>The field text, quoted only when necessary.</returns>
        private static string EscapeCsvField( string value )
        {
            if ( string.IsNullOrEmpty( value ) )
            {
                return "";
            }

            if ( value.IndexOfAny( new char[ ] { ',', '"', '\r', '\n' } ) < 0 )
            {
                return value;
            }

            return "\"" + value.Replace( "\"", "\"\"" ) + "\"";
        }

        /// <summary>
        /// Appends default-named columns ("Column N") until the table has at
        /// least <paramref name="count"/> columns.
        /// </summary>
        /// <param name="table">The table to widen.</param>
        /// <param name="count">Minimum number of columns required.</param>
        private static void EnsureColumnCount( DataTable table, int count )
        {
            while ( table.Columns.Count < count )
            {
                int number = table.Columns.Count + 1;
                string name = "Column " + System.Convert.ToString( number );

                // A header may already use this text; skip ahead to a free name.
                while ( table.Columns.Contains( name ) )
                {
                    number++;
                    name = "Column " + System.Convert.ToString( number );
                }

                table.Columns.Add( name );
            }
        }

        /// <summary>
        /// Extracts every <c>&lt;table&gt;</c> element of an HTML document
        /// into a <see cref="DataTable"/>.
        /// </summary>
        /// <remarks>
        /// When a table has <c>&lt;th&gt;</c> cells their text names the
        /// columns and the first <c>&lt;tr&gt;</c> is treated as the header
        /// row and skipped; otherwise columns are named "Column 1",
        /// "Column 2", ... from the cell count of the first row. Cell text is
        /// stored as found between the tags, with no further clean-up.
        /// </remarks>
        /// <param name="HTML">The HTML text to scan; may be null or empty.</param>
        /// <returns>
        /// A data set with one table per matched <c>&lt;table&gt;</c>
        /// element, in document order; empty when nothing matches.
        /// </returns>
        private static DataSet ConvertHTMLTablesToDataSet( string HTML )
        {
            DataSet ds = new DataSet( );
            if ( string.IsNullOrEmpty( HTML ) )
            {
                return ds;
            }

            // \b stops "<th" from matching "<thead" and "<tr" from matching "<track".
            string TableExpression = @"<table\b[^>]*>(.*?)</table>";
            string HeaderExpression = @"<th\b[^>]*>(.*?)</th>";
            string RowExpression = @"<tr\b[^>]*>(.*?)</tr>";
            string ColumnExpression = @"<td\b[^>]*>(.*?)</td>";

            MatchCollection Tables = Regex.Matches( HTML, TableExpression, HtmlRegexOptions );

            foreach ( Match Table in Tables )
            {
                DataTable dt = new DataTable( );

                MatchCollection Headers = Regex.Matches( Table.Value, HeaderExpression, HtmlRegexOptions );
                MatchCollection Rows = Regex.Matches( Table.Value, RowExpression, HtmlRegexOptions );
                bool HeadersExist = Headers.Count > 0;

                if ( HeadersExist )
                {
                    foreach ( Match Header in Headers )
                    {
                        dt.Columns.Add( Header.Groups[ 1 ].ToString( ) );
                    }
                }
                else if ( Rows.Count > 0 )
                {
                    // No header cells: size the table from its first row.
                    int columns = Regex.Matches( Rows[ 0 ].Value, ColumnExpression, HtmlRegexOptions ).Count;
                    EnsureColumnCount( dt, columns );
                }

                int iCurrentRow = 0;
                foreach ( Match Row in Rows )
                {
                    // The first row holds the headers when there are any.
                    if ( !( iCurrentRow == 0 && HeadersExist ) )
                    {
                        MatchCollection Columns = Regex.Matches( Row.Value, ColumnExpression, HtmlRegexOptions );

                        // Widen the table before creating the row so a row
                        // longer than the header cannot index out of range.
                        EnsureColumnCount( dt, Columns.Count );

                        DataRow dr = dt.NewRow( );
                        int iCurrentColumn = 0;
                        foreach ( Match Column in Columns )
                        {
                            dr[ iCurrentColumn ] = Column.Groups[ 1 ].ToString( );
                            iCurrentColumn++;
                        }

                        dt.Rows.Add( dr );
                    }

                    iCurrentRow++;
                }

                ds.Tables.Add( dt );
            }

            return ds;
        }
    }
}
posted by dharh 11:42 PM May 29th, 2011
