Learn 2 CSharp 11
11. The input is an HTML table; remove all tags and put the data in a comma/tab-separated file.
Part of this was an exercise in looking up what others have already done.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Data;
using System.Text.RegularExpressions;
using System.Threading;
namespace lrn2CSharp11
{
class Program
{
/// <summary>
/// Prompts for the name of a file containing HTML tables, converts every table found
/// in it to CSV, writes the result next to the input as "(input name).csv", and then
/// waits for the Escape key before exiting.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main( string[ ] args )
{
    string file = "";
    Console.WriteLine( "Input name of a file with an HTML table in the current directory to convert to csv." );
    Console.Write( ":" );
    try
    {
        file = Console.ReadLine( );
        if ( string.IsNullOrEmpty( file ) )
        {
            // ReadLine returns null at end of input; there is nothing to convert.
            Console.WriteLine( "No file name was entered." );
        }
        else
        {
            // Read and parse first so that a failure does not leave an empty .csv behind.
            string html = File.ReadAllText( file );
            DataSet ds = ConvertHTMLTablesToDataSet( html );

            // The using block guarantees the writer is flushed and closed even on error.
            using ( StreamWriter sw = new StreamWriter( file + ".csv" ) )
            {
                if ( ds != null )
                {
                    foreach ( DataTable table in ds.Tables )
                    {
                        WriteTableAsCsv( table, sw );
                    }
                }
            }
        }
    }
    catch ( Exception ex )
    {
        // Keep the program alive so the exit prompt still appears, but say what went wrong.
        Console.WriteLine( "Could not convert \"" + file + "\": " + ex.Message );
    }
    Console.WriteLine( "Press esc to exit." );
    while ( !keyPressHandler( Console.ReadKey( true ) ) )
    {
        // ReadKey blocks until a key is pressed, so no sleep is needed here.
    }
}

/// <summary>
/// Writes one table as CSV: a header line with the column names, one line per row,
/// and a trailing blank line that separates it from the next table.
/// </summary>
/// <param name="table">The table to write.</param>
/// <param name="writer">The destination for the CSV text.</param>
private static void WriteTableAsCsv( DataTable table, TextWriter writer )
{
    int columnCount = table.Columns.Count;
    string[ ] fields = new string[ columnCount ];

    for ( int i = 0; i < columnCount; i++ )
    {
        fields[ i ] = EscapeCsvField( table.Columns[ i ].ColumnName );
    }
    writer.WriteLine( string.Join( ",", fields ) );

    foreach ( DataRow row in table.Rows )
    {
        for ( int i = 0; i < columnCount; i++ )
        {
            // DBNull cells are written as empty fields.
            fields[ i ] = row.IsNull( i ) ? "" : EscapeCsvField( row[ i ].ToString( ) );
        }
        writer.WriteLine( string.Join( ",", fields ) );
    }
    writer.WriteLine( );
}

/// <summary>
/// Quotes a value for CSV output when it contains a comma, a double quote or a line
/// break, doubling any embedded double quotes; other values are returned unchanged.
/// </summary>
/// <param name="field">The raw field text.</param>
/// <returns>The field text, safe to place between commas on a CSV line.</returns>
private static string EscapeCsvField( string field )
{
    if ( field.IndexOfAny( new char[ ] { ',', '"', '\r', '\n' } ) < 0 )
    {
        return field;
    }
    return "\"" + field.Replace( "\"", "\"\"" ) + "\"";
}
/// <summary>
/// Tells the caller whether the given key press was the Escape key.
/// </summary>
/// <param name="input">The key press to inspect.</param>
/// <returns>true for Escape; false for any other key.</returns>
private static Boolean keyPressHandler( ConsoleKeyInfo input )
{
    Boolean escapePressed = ( ConsoleKey.Escape == input.Key );
    return escapePressed;
}
/// <summary>
/// Extracts every table element from an HTML string into a DataSet, one DataTable per
/// HTML table. Text of th cells becomes the column names; when a table has fewer named
/// columns than a row has td cells, extra columns named "Column N" are added. Markup
/// inside a cell is removed and the remaining text is trimmed.
/// </summary>
/// <remarks>
/// Parsing is regex based, so nested tables are not supported: the first closing table
/// tag ends the match.
/// </remarks>
/// <param name="HTML">The HTML text to scan; null or empty yields an empty DataSet.</param>
/// <returns>A DataSet with one DataTable per table found, in document order.</returns>
private static DataSet ConvertHTMLTablesToDataSet( string HTML )
{
    // Singleline lets "." span line breaks inside an element. The \b after each tag name
    // stops th from matching thead, tr from matching track, and so on.
    const RegexOptions Options = RegexOptions.Singleline | RegexOptions.IgnoreCase;
    const string TableExpression = @"<table\b[^>]*>(.*?)</table>";
    const string HeaderExpression = @"<th\b[^>]*>(.*?)</th>";
    const string RowExpression = @"<tr\b[^>]*>(.*?)</tr>";
    const string ColumnExpression = @"<td\b[^>]*>(.*?)</td>";

    DataSet ds = new DataSet( );
    if ( string.IsNullOrEmpty( HTML ) )
    {
        return ds;
    }

    foreach ( Match table in Regex.Matches( HTML, TableExpression, Options ) )
    {
        DataTable dt = new DataTable( );

        foreach ( Match header in Regex.Matches( table.Value, HeaderExpression, Options ) )
        {
            AddUniqueColumn( dt, CleanCellText( header.Groups[ 1 ].Value ) );
        }

        foreach ( Match row in Regex.Matches( table.Value, RowExpression, Options ) )
        {
            MatchCollection cells = Regex.Matches( row.Value, ColumnExpression, Options );

            // A row made only of header cells carries no data, wherever it appears.
            if ( cells.Count == 0 && Regex.IsMatch( row.Value, HeaderExpression, Options ) )
            {
                continue;
            }

            // Grow the table before creating the row so a wide row cannot overflow it.
            while ( dt.Columns.Count < cells.Count )
            {
                AddUniqueColumn( dt, "Column " + System.Convert.ToString( dt.Columns.Count + 1 ) );
            }

            DataRow dr = dt.NewRow( );
            for ( int i = 0; i < cells.Count; i++ )
            {
                dr[ i ] = CleanCellText( cells[ i ].Groups[ 1 ].Value );
            }
            dt.Rows.Add( dr );
        }

        ds.Tables.Add( dt );
    }
    return ds;
}

/// <summary>
/// Adds a column to the table, appending " (2)", " (3)", ... to the name when a column
/// with that name already exists, because DataTable rejects duplicate column names.
/// </summary>
/// <param name="table">The table that receives the column.</param>
/// <param name="name">The preferred column name.</param>
private static void AddUniqueColumn( DataTable table, string name )
{
    string candidate = name;
    int suffix = 2;
    while ( table.Columns.Contains( candidate ) )
    {
        candidate = name + " (" + System.Convert.ToString( suffix ) + ")";
        suffix++;
    }
    table.Columns.Add( candidate );
}

/// <summary>
/// Removes any markup tags from the inner HTML of a cell and trims surrounding whitespace.
/// </summary>
/// <param name="innerHtml">The text captured between a cell's opening and closing tags.</param>
/// <returns>The cell's text content without tags.</returns>
private static string CleanCellText( string innerHtml )
{
    return Regex.Replace( innerHtml, "<[^>]*>", string.Empty ).Trim( );
}
}
}
posted by dharh 11:42 PM May 29th, 2011
|
AI airships america batteries blogs books browser C# c++ chatGPT computers conversation copyright covid cpp cr-48 CSharp dharh disaster DIY DRM economy energy environment FCC gaming google government history HTML humor idt internet interview japan java javascript linkjack linux lrn2program MLP moving music nature nefeedeater neThing neTodo networking news opensource philosophy podcasts poverty programming projects python reading religion science sick simple software space sparce tagging technology twitter unbirthday video wiki
|