Sample code for 30+ languages & platforms
Unicode C

HTML Table to CSV

See more HTML-to-XML/Text Examples

Demonstrates a method for converting an HTML table to a CSV file.

Note: This example requires Chilkat v9.5.0.77 or greater.

Chilkat Unicode C Downloads

Unicode C
#include <wchar.h>

#include <C_CkHttpW.h>
#include <C_CkBinDataW.h>
#include <C_CkHtmlToXmlW.h>
#include <C_CkStringBuilderW.h>
#include <C_CkXmlW.h>
#include <C_CkCsvW.h>

/*
 * Demonstrates converting an HTML table to a CSV file.
 *
 * Steps:
 *   1. Download the HTML page into a BinData object.
 *   2. Convert the HTML to XML and load it into an Xml object.
 *   3. Prune/scrub the XML, then print it.
 *   4. Walk html|body|div[i]|table|tbody|tr[j]|td[k]|text and copy each
 *      cell into a Csv object.
 *   5. Save the CSV to "qa_output/brasil_etf.csv" and print it.
 *
 * Takes no arguments and returns nothing; progress and failures are
 * reported with wprintf.  Every Chilkat object created here is disposed
 * before returning, on both the success and the failure paths.
 *
 * This example requires the Chilkat API to have been previously unlocked.
 * See Global Unlock Sample for sample code.
 */
void ChilkatSample(void)
{
    BOOL success;
    HCkHttpW http;
    HCkBinDataW bdHtml;
    HCkHtmlToXmlW htx;
    HCkStringBuilderW sbXml;
    HCkXmlW xml;
    HCkCsvW csv;
    int i;
    int count_i;
    int j;
    int count_j;
    int k;
    int count_k;
    int row;
    const wchar_t *cell;
    const wchar_t *csvStr;

    // Create every object up front so that all exit paths can share the
    // single cleanup section at the end of the function.
    http = CkHttpW_Create();
    bdHtml = CkBinDataW_Create();
    htx = CkHtmlToXmlW_Create();
    sbXml = CkStringBuilderW_Create();
    xml = CkXmlW_Create();
    csv = CkCsvW_Create();

    // First download the HTML containing the table.
    success = CkHttpW_QuickGetBd(http,L"https://example-code.com/data/etf_table.html",bdHtml);
    if (success != TRUE) {
        // %ls is the conversion for a wchar_t string in wprintf.
        wprintf(L"%ls\n",CkHttpW_lastErrorText(http));
        goto cleanup;
    }

    // Convert to XML.
    success = CkHtmlToXmlW_SetHtmlBd(htx,bdHtml);
    if (success == TRUE) {
        success = CkHtmlToXmlW_ToXmlSb(htx,sbXml);
    }
    if (success != TRUE) {
        wprintf(L"Failed to convert the HTML to XML.\n");
        goto cleanup;
    }

    success = CkXmlW_LoadSb(xml,sbXml,TRUE);
    if (success != TRUE) {
        wprintf(L"Failed to load the converted XML.\n");
        goto cleanup;
    }

    // Remove attributes and sub-trees we don't need.
    // (In other words, we're getting rid of clutter...)
    // The returned removal counts are not needed here.
    CkXmlW_PruneTag(xml,L"thead");
    CkXmlW_PruneAttribute(xml,L"style");
    CkXmlW_PruneAttribute(xml,L"class");

    // Scrub the element and attribute content.
    CkXmlW_Scrub(xml,L"ContentTrimEnds,ContentTrimInside,AttrTrimEnds,AttrTrimInside");

    // Let's see what we have...
    wprintf(L"%ls\n",CkXmlW_getXml(xml));

    // We have the following XML.
    // Copy this XML into the online tool at Generate Parsing Code from XML
    // as a starting point for accessing the data..

    // <?xml version="1.0" encoding="utf-8"?>
    // <root>
    //     <html>
    //         <head>
    //             <meta http-equiv="content-type" content="text/html; charset=UTF-8"/>
    //         </head>
    //         <body text="#000000" bgcolor="#FFFFFF">
    //             <div>
    //                 <div>
    //                     <table role="grid" data-scrollx="true" data-sortdirection="desc" data-sorton="-1"/>
    //                 </div>
    //             </div>
    //             <div>
    //                 <table id="topHoldingsTable" role="grid" data-scrollx="true" data-sortdirection="desc" data-sorton="-1">
    //                     <tbody>
    //                         <tr role="row">
    //                             <td>
    //                                 <text>ITUB4</text>
    //                             </td>
    //                             <td>
    //                                 <text>ITAU UNIBANCO HOLDING PREF SA</text>
    //                             </td>
    //                             <td>
    //                                 <text>Financials</text>
    //                             </td>
    //                             <td>
    //                                 <text>Brazil</text>
    //                             </td>
    //                             <td>
    //                                 <text>10.94</text>
    //                             </td>
    //                             <td>
    //                                 <text>998,954,813.73</text>
    //                             </td>
    //                         </tr>
    //                         <tr role="row">
    //                             <td>
    //                                 <text>BBDC4</text>
    //                             </td>
    //                             <td>
    //                                 <text>BANCO BRADESCO PREF SA</text>
    //                             </td>
    //                             <td>
    //                                 <text>Financials</text>
    //                             </td>
    //                             <td>
    //                                 <text>Brazil</text>
    //                             </td>
    //                             <td>
    //                                 <text>9.01</text>
    //                             </td>
    //                             <td>
    //                                 <text>822,164,622.75</text>
    //                             </td>
    //                         </tr>
    //                         ...
    //                         ...
    //                         ...
    //                     </tbody>
    //                 </table>
    //             </div>
    //         </body>
    //     </html>
    // </root>

    // The traversal below is adapted from the code generated by the online
    // tool: it keeps only the div / tr / td loops needed to build the CSV.
    CkCsvW_SetColumnName(csv,0,L"Ticker");
    CkCsvW_SetColumnName(csv,1,L"Name");
    CkCsvW_SetColumnName(csv,2,L"Sector");
    CkCsvW_SetColumnName(csv,3,L"Country");
    CkCsvW_SetColumnName(csv,4,L"Weight");
    CkCsvW_SetColumnName(csv,5,L"Notional Value");

    // 'row' is the next CSV row to fill.  It keeps counting across tables so
    // that rows found under a later <div> are appended instead of overwriting
    // rows already written from an earlier one.
    row = 0;
    count_i = CkXmlW_NumChildrenHavingTag(xml,L"html|body|div");
    for (i = 0; i < count_i; i++) {
        // [i], [j] and [k] in the paths are substituted from the values set
        // with putI / putJ / putK.
        CkXmlW_putI(xml,i);
        count_j = CkXmlW_NumChildrenHavingTag(xml,L"html|body|div[i]|table|tbody|tr");
        for (j = 0; j < count_j; j++) {
            CkXmlW_putJ(xml,j);
            count_k = CkXmlW_NumChildrenHavingTag(xml,L"html|body|div[i]|table|tbody|tr[j]|td");
            for (k = 0; k < count_k; k++) {
                CkXmlW_putK(xml,k);
                cell = CkXmlW_getChildContent(xml,L"html|body|div[i]|table|tbody|tr[j]|td[k]|text");
                // NOTE(review): presumably a null pointer comes back when the
                // <td> has no <text> child; store an empty cell in that case.
                if (!cell) {
                    cell = L"";
                }
                CkCsvW_SetCell(csv,row,k,cell);
            }

            row = row + 1;
        }
    }

    success = CkCsvW_SaveFile(csv,L"qa_output/brasil_etf.csv");
    if (success != TRUE) {
        // Report the failure but still print the CSV text below.
        wprintf(L"Failed to save qa_output/brasil_etf.csv\n");
    }

    csvStr = CkCsvW_saveToString(csv);
    wprintf(L"%ls\n",csvStr);

    // Our CSV looks like this:
    // Ticker,Name,Sector,Country,Weight,Notional Value
    // ITUB4,ITAU UNIBANCO HOLDING PREF SA,Financials,Brazil,10.94,"998,954,813.73"
    // BBDC4,BANCO BRADESCO PREF SA,Financials,Brazil,9.01,"822,164,622.75"
    // VALE3,CIA VALE DO RIO DOCE SH,Materials,Brazil,8.60,"785,290,260.07"
    // PETR4,PETROLEO BRASILEIRO PREF SA,Energy,Brazil,5.68,"518,124,434.10"
    // PETR3,PETROBRAS,Energy,Brazil,4.86,"443,254,438.53"
    // B3SA3,B3 BRASIL BOLSA BALCAO SA,Financials,Brazil,4.57,"417,636,740.16"
    // ABEV3,AMBEV SA,Consumer Staples,Brazil,4.57,"417,216,913.63"
    // BBAS3,BANCO DO BRASIL SA,Financials,Brazil,3.25,"296,921,232.15"
    // ITSA4,ITAUSA INVESTIMENTOS ITAU PREF SA,Financials,Brazil,2.90,"265,153,684.52"
    // LREN3,LOJAS RENNER SA,Consumer Discretionary,Brazil,2.25,"205,832,175.98"
    //

cleanup:
    CkHttpW_Dispose(http);
    CkBinDataW_Dispose(bdHtml);
    CkHtmlToXmlW_Dispose(htx);
    CkStringBuilderW_Dispose(sbXml);
    CkXmlW_Dispose(xml);
    CkCsvW_Dispose(csv);
}