Learn 2 CSharp 11

11. Input is an HTML table. Remove all tags and put the data in a comma/tab-separated file.

Part of this was an exercise in looking up what others have already done.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Data;
using System.Text.RegularExpressions;
using System.Threading;

namespace lrn2CSharp11
{
    /// <summary>
    /// Console tool that reads a file containing one or more HTML tables and
    /// writes their contents to "&lt;input file&gt;.csv" as comma separated values.
    /// </summary>
    /// <remarks>
    /// Parsing is regex based, so nested tables and malformed markup (for example
    /// unclosed cells) are not supported.
    /// </remarks>
    class Program
    {
        // Singleline lets '.' span line breaks inside a cell; IgnoreCase accepts <TD> as well as <td>.
        private const RegexOptions HtmlOptions =
            RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnoreCase;

        // The \b after each tag name stops "<th" from matching "<thead" (and similar prefixes).
        private static readonly Regex TableRegex = new Regex( @"<table\b[^>]*>(.*?)</table>", HtmlOptions );
        private static readonly Regex HeaderRegex = new Regex( @"<th\b[^>]*>(.*?)</th>", HtmlOptions );
        private static readonly Regex RowRegex = new Regex( @"<tr\b[^>]*>(.*?)</tr>", HtmlOptions );
        private static readonly Regex CellRegex = new Regex( @"<td\b[^>]*>(.*?)</td>", HtmlOptions );

        // Matches any remaining markup inside a cell, e.g. <b>, <a href="...">, <br/>.
        private static readonly Regex TagRegex = new Regex( @"<[^>]+>", HtmlOptions );

        // Characters that force a CSV field to be quoted.
        private static readonly char[] CsvSpecialChars = { ',', '"', '\r', '\n' };

        /// <summary>
        /// Prompts for a file name, converts the HTML tables found in that file to CSV,
        /// then waits for the Escape key before exiting.
        /// </summary>
        /// <param name="args">Command line arguments (unused).</param>
        static void Main( string[ ] args )
        {
            Console.WriteLine( "Input name of a file with an HTML table in the current directory to convert to csv." );
            Console.Write( ":" );

            try
            {
                string file = Console.ReadLine( );

                if ( string.IsNullOrWhiteSpace( file ) )
                {
                    Console.WriteLine( "No file name given." );
                }
                else
                {
                    // Parse before creating the output file so a failure does not
                    // leave an empty .csv behind.
                    string html = File.ReadAllText( file );
                    DataSet ds = ConvertHTMLTablesToDataSet( html );

                    // 'using' guarantees the writer is flushed and closed even on error.
                    using ( StreamWriter sw = new StreamWriter( file + ".csv" ) )
                    {
                        WriteCsv( ds, sw );
                    }
                }
            }
            catch ( Exception ex )
            {
                // Report what actually went wrong instead of hiding it.
                Console.WriteLine( "Conversion failed: " + ex.Message );
            }

            Console.WriteLine( "Press esc to exit." );

            // ReadKey blocks until a key is pressed, so no sleep is needed between polls.
            while ( !keyPressHandler( Console.ReadKey( true ) ) )
            {
                /* no op */
            }
        }

        /// <summary>Returns true when the pressed key is Escape.</summary>
        /// <param name="input">Key information returned by Console.ReadKey.</param>
        private static Boolean keyPressHandler( ConsoleKeyInfo input )
        {
            return input.Key == ConsoleKey.Escape;
        }

        /// <summary>
        /// Writes every table of <paramref name="ds"/> as a header line followed by one
        /// line per row, with a blank line after each table.
        /// </summary>
        /// <param name="ds">Tables to write; null writes nothing.</param>
        /// <param name="writer">Destination for the CSV text.</param>
        private static void WriteCsv( DataSet ds, TextWriter writer )
        {
            if ( ds == null )
            {
                return;
            }

            foreach ( DataTable table in ds.Tables )
            {
                int columnCount = table.Columns.Count;
                string[ ] fields = new string[ columnCount ];

                for ( int i = 0; i < columnCount; i++ )
                {
                    fields[ i ] = EscapeCsvField( table.Columns[ i ].ColumnName );
                }
                writer.WriteLine( string.Join( ",", fields ) );

                foreach ( DataRow row in table.Rows )
                {
                    for ( int i = 0; i < columnCount; i++ )
                    {
                        // Cells missing from a short row are DBNull and become empty fields.
                        fields[ i ] = Convert.IsDBNull( row[ i ] )
                            ? string.Empty
                            : EscapeCsvField( row[ i ].ToString( ) );
                    }
                    writer.WriteLine( string.Join( ",", fields ) );
                }

                writer.WriteLine( );
            }
        }

        /// <summary>
        /// Quotes a field when it contains a comma, double quote or line break,
        /// doubling any embedded double quotes.
        /// </summary>
        /// <param name="field">Raw field text.</param>
        /// <returns>The field, quoted only if necessary.</returns>
        private static string EscapeCsvField( string field )
        {
            if ( field.IndexOfAny( CsvSpecialChars ) < 0 )
            {
                return field;
            }

            return "\"" + field.Replace( "\"", "\"\"" ) + "\"";
        }

        /// <summary>Removes any markup left inside a cell and trims surrounding whitespace.</summary>
        /// <param name="rawCell">Inner HTML of a &lt;td&gt; or &lt;th&gt; element.</param>
        private static string CleanCell( string rawCell )
        {
            return TagRegex.Replace( rawCell, string.Empty ).Trim( );
        }

        /// <summary>
        /// Adds a column called <paramref name="name"/>, falling back to "Column N" for an
        /// empty name and appending " (2)", " (3)", ... when the name is already taken.
        /// </summary>
        /// <param name="table">Table receiving the column.</param>
        /// <param name="name">Desired column name.</param>
        private static void AddUniqueColumn( DataTable table, string name )
        {
            if ( string.IsNullOrEmpty( name ) )
            {
                name = "Column " + Convert.ToString( table.Columns.Count + 1 );
            }

            string candidate = name;
            int suffix = 2;

            // DataColumnCollection rejects duplicate names, so pick a free one first.
            while ( table.Columns.Contains( candidate ) )
            {
                candidate = name + " (" + Convert.ToString( suffix ) + ")";
                suffix++;
            }

            table.Columns.Add( candidate );
        }

        /// <summary>Appends "Column N" columns until the table has at least <paramref name="required"/> columns.</summary>
        /// <param name="table">Table to widen.</param>
        /// <param name="required">Minimum number of columns needed.</param>
        private static void EnsureColumnCount( DataTable table, int required )
        {
            while ( table.Columns.Count < required )
            {
                AddUniqueColumn( table, "Column " + Convert.ToString( table.Columns.Count + 1 ) );
            }
        }

        /// <summary>
        /// Extracts every &lt;table&gt; element from <paramref name="HTML"/> into a DataTable.
        /// </summary>
        /// <remarks>
        /// When a table contains &lt;th&gt; cells they become the column names and the first
        /// row is treated as the header row and skipped; otherwise columns are named
        /// "Column 1", "Column 2", ... Rows wider than the current column set extend it
        /// instead of failing.
        /// </remarks>
        /// <param name="HTML">HTML text to scan; null or empty yields an empty DataSet.</param>
        /// <returns>A DataSet with one DataTable per matched table, in document order.</returns>
        private static DataSet ConvertHTMLTablesToDataSet( string HTML )
        {
            DataSet ds = new DataSet( );

            if ( string.IsNullOrEmpty( HTML ) )
            {
                return ds;
            }

            foreach ( Match table in TableRegex.Matches( HTML ) )
            {
                DataTable dt = new DataTable( );
                MatchCollection headers = HeaderRegex.Matches( table.Value );
                MatchCollection rows = RowRegex.Matches( table.Value );

                // Decide on headers from the actual matches rather than a
                // case-sensitive substring test that also hits <thead>.
                bool headersExist = headers.Count > 0;

                if ( headersExist )
                {
                    foreach ( Match header in headers )
                    {
                        AddUniqueColumn( dt, CleanCell( header.Groups[ 1 ].Value ) );
                    }
                }
                else if ( rows.Count > 0 )
                {
                    // No headers: size the table from the first row's cell count.
                    EnsureColumnCount( dt, CellRegex.Matches( rows[ 0 ].Value ).Count );
                }

                int rowIndex = 0;

                foreach ( Match row in rows )
                {
                    bool isHeaderRow = headersExist && rowIndex == 0;
                    rowIndex++;

                    if ( isHeaderRow )
                    {
                        continue;
                    }

                    MatchCollection cells = CellRegex.Matches( row.Value );

                    // Widen the table before creating the row so an over-long row cannot
                    // index past the last column.
                    EnsureColumnCount( dt, cells.Count );

                    DataRow dr = dt.NewRow( );

                    for ( int c = 0; c < cells.Count; c++ )
                    {
                        dr[ c ] = CleanCell( cells[ c ].Groups[ 1 ].Value );
                    }

                    dt.Rows.Add( dr );
                }

                ds.Tables.Add( dt );
            }

            return ds;
        }
    }
}

posted by dharh 11:42 PM May 29th, 2011



2024: 1
2023: 4 2 1
2022: 5 3
2011: 5 3 1
2010: 12 9 7 1
2009: 12 11 8 5
2008: 12 5 4 3 2 1
2007: 12 11 10 9 8 7 6 5 4 3 2 1
2006: 12 11 10 9 8 7 6 5 4 3 2 1
2005: 12 10 7 6
2004: 10 9 6 5 4 3 2 1