Java
Efficiently Process a Huge XML File

Demonstrates a technique for processing a huge XML file (can be any size, even many gigabytes).
Note: This example requires Chilkat v9.5.0.80 or greater.
Chilkat Java Downloads

Java
import com.chilkatsoft.*;

/**
 * Demonstrates processing a very large XML file without ever loading the whole
 * document into memory.
 *
 * <p>The file is streamed as plain text; each {@code <Transaction ...> ... </Transaction>}
 * fragment is extracted in turn and loaded into a {@code CkXml} object on its own.
 */
public class ChilkatExample {

  // The Chilkat classes are wrappers around a native library, which must be
  // loaded before any of them is used.
  static {
    try {
        System.loadLibrary("chilkat");
    } catch (UnsatisfiedLinkError e) {
      System.err.println("Native code library failed to load.\n" + e);
      System.exit(1);
    }
  }

  /**
   * Reads {@code qa_data/xml/transactions.xml} fragment by fragment, loads each
   * fragment into an Xml object, and prints the number of fragments extracted.
   *
   * @param argv command-line arguments (unused)
   */
  public static void main(String argv[])
  {
    //  This example shows a way to efficiently process a gigantic XML file -- one that may be too large
    //  to fit in memory.
    //
    //  Two types of XML parsers exist: DOM parsers and SAX parsers.

    //  A DOM parser is a Document Object Model parser, where the entire XML is loaded into memory
    //  and the application has the luxury of interacting with the XML in a convenient, random-access
    //  way.  The Chilkat Xml class is a DOM parser.  Because the entire XML is loaded into memory,
    //  huge XML files (on the order of gigabytes) usually cannot be loaded because of memory constraints.

    //  A SAX parser treats the XML file as an input stream and parses it incrementally.  No DOM exists.
    //  Using a SAX parser is generally less palatable than using a DOM parser, for many reasons.
    //
    //  The technique described here is a hybrid.  It streams the XML file as unstructured text
    //  to extract fragments that are individually treated as separate XML documents loaded into
    //  the Chilkat Xml parser.
    //
    //  For example, imagine your XML file is several GBs in size, but has a relatively simple structure, such as:
    //
    //  <Transactions>
    //      <Transaction id="1">
    //           ...
    //      </Transaction>
    //      <Transaction id="2">
    //           ...
    //      </Transaction>
    //      <Transaction id="3">
    //           ...
    //      </Transaction>
    //  ...
    //  </Transactions>

    //  In the following code, each <Transaction ...> ... </Transaction>
    //  is extracted and loaded separately into an Xml object, where it can be manipulated
    //  independently.  The complete XML file is never loaded into memory.

    CkFileAccess fac = new CkFileAccess();

    if (!fac.OpenForRead("qa_data/xml/transactions.xml")) {
        System.out.println(fac.lastErrorText());
        return;
    }

    //  Counts every fragment extracted from the file, including any that subsequently fail to parse.
    int numTransactions = 0;

    try {
        CkXml xml = new CkXml();
        CkStringBuilder sb = new CkStringBuilder();

        //  The begin marker is "XML tag aware".  If the begin marker begins with "<"
        //  and ends with ">", then it is assumed to be an XML tag, and it also matches
        //  text where a whitespace char appears in place of the ">" (i.e. the tag has attributes,
        //  as in <Transaction id="1">).
        String beginMarker = "<Transaction>";
        String endMarker = "</Transaction>";

        boolean firstIteration = true;
        int retval = 1;

        while (retval == 1) {
            sb.Clear();
            //  The retval can have the following values:
            //  0: No more fragments exist.
            //  1: Captured the next fragment.  The text from beginMarker to endMarker, including the markers, is returned in sb.
            //  -1: Error.
            retval = fac.ReadNextFragment(firstIteration,beginMarker,endMarker,"utf-8",sb);
            firstIteration = false;

            if (retval != 1) {
                //  Nothing was captured; the loop condition ends the iteration.
                continue;
            }

            numTransactions = numTransactions+1;

            //  Do not ignore a parse failure: report it and move on to the next fragment
            //  rather than handing a stale or empty Xml object to the application code.
            if (!xml.LoadSb(sb,true)) {
                System.out.println(xml.lastErrorText());
                continue;
            }

            //  Your application may now do what it needs with this particular XML fragment...
        }

        if (retval < 0) {
            System.out.println(fac.lastErrorText());
        }
    } finally {
        //  Release the file handle obtained by OpenForRead, whatever happened above.
        fac.FileClose();
    }

    System.out.println("numTransactions: " + numTransactions);
  }
}