SplashBridge.java.html

package Torello.HTML;

import Torello.HTML.*;
import Torello.Java.FileRW;
import java.util.*;
import java.io.*;
import java.net.*;

/**
 * Demonstrates using 'Splash,' which is one of many ways to execute the Java-Script on
 * Web-Pages, before those pages are parsed.
 * 
 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=SPLASH_BRIDGE>
 */
public class SplashBridge
{
    // Utility class: holds only a constant and a static example, so it is never instantiated.
    private SplashBridge() { }

    /**
     * Once the {@code Splash HTTP Server} is running (which requires the {@code Docker} loading
     * and installation tool), all one has to do is <I><B>prepend this {@code String}</B></I> to
     * any <I>URL-encoded</I> {@code URL}, and the {@code Splash Script Executor} will be invoked
     * on the HTML and Script that is received from that {@code URL}.
     *
     * <BR /><BR /><B>IMPORTANT:</B> The target {@code URL} is passed as the <I>value</I> of the
     * {@code 'url'} query-parameter.  If the target has a query-string of its own, any
     * {@code '&'} in it would be read as the start of another parameter of the
     * {@code 'render.html'} end-point, and the target would be cut short.  Always pass the target
     * through {@code java.net.URLEncoder} first.
     * 
     * <DIV CLASS="EXAMPLE">{@code
     * String   myURL               = "https://cars.com";
     * URL      withSplashServerURL =
     *      new URL(SplashBridge.SPLASH_URL + URLEncoder.encode(myURL, "UTF-8"));
     *
     * // Here, just use the standard HTML scrape and parsing routines to retrieve the HTML
     * // from the URL 'myURL'.  Splash will execute any 'dynamic HTML' that is loaded via the
     * // standard script libraries like AJAX, JSON, React-JS, jQuery, or Angular.
     *
     * Vector<HTMLNode> html = HTMLPage.getPageTokens(withSplashServerURL, false);
     * 
     * // NOTE: The above invocation will not call the "www.cars.com" server, BUT RATHER, will
     * //       ask the HTTP Server running on the local host as a PROXY to retrieve the HTML
     * //       from "www.cars.com".  Before returning that HTML, the local proxy server will also
     * //       execute the dynamic-loading script that is present on the main page of "cars.com"
     * // 
     * // ALSO: There are other libraries that perform this type of work: Selenium, and Android
     * //       class WebView.
     * }</DIV>
     */
    public static final String SPLASH_URL = "http://localhost:8050/render.html?url=";

    /**
     * Downloads one Wikipedia page twice - once through the local Splash server, and once
     * directly - and writes the two results to {@code "cc.html"} and {@code "cc2.html"} in the
     * current working directory, so that their contents may be compared.
     *
     * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=SPLASH_DOCKER>
     *
     * @throws IOException If there are any {@code HTTP} errors when downloading or processing
     * the HTML.
     */
    public static void example01() throws IOException
    {
        // The page being retrieved.  It is declared once so that both downloads are guaranteed
        // to be requesting the exact same address.

        String pageURL =
            "https://en.wikipedia.org/w/index.php?title=Christopher_Columbus&oldid=924321156";


        // Call the splash-bridge running on local-host @ port 8050
        //
        // The page's URL MUST be URL-encoded.  It contains "&oldid=...", and if left as-is, that
        // '&' would end the 'url' parameter early: "oldid" would be handed to Splash as one of
        // its own arguments, and Splash would then render a different page than the one that is
        // downloaded directly below.
        //
        // The "wait" parameter asks Splash to wait four seconds after the page has loaded, so
        // that java-script AJAX data-retrieval tasks on the page may run.  The "timeout"
        // parameter caps the entire render at ten seconds.

        String splashRequest =
            SPLASH_URL + URLEncoder.encode(pageURL, "UTF-8") + "&timeout=10&wait=4.0";

        URL url = new URL(splashRequest);


        // This will just use the standard Java HTTP URLConnection class to connect to the exact
        // same page.

        URL url2 = new URL(pageURL);


        // Download both versions.  This version is contacting a Splash Server on a local host
        // running @ port 8050
        // NOTE: This writes the HTML to a Flat-File on the File-System.

        Vector<HTMLNode> v = HTMLPage.getPageTokens(url, false);

        FileRW.writeFile(Util.pageToString(v), "cc.html");


        // This version is contacting Wikipedia.com, and ignoring any possible AJAX or Java-Script
        // calls - script calls of any kind are being ignored by this version.
        // NOTE: This writes the HTML to a Flat-File on the File-System.

        Vector<HTMLNode> v2 = HTMLPage.getPageTokens(url2, false);

        FileRW.writeFile(Util.pageToString(v2), "cc2.html");


        // Figures recorded from an earlier run (exact sizes will vary from run to run):
        // FileOutput Size: Version 1: 650737 Nov  4 18:28 cc.html
        // FileOutput Size: Version 2: 493879 Nov  4 18:28 cc2.html
        // RESULTS: Clearly there is quite a bit of downloaded data from AJAX & Splash
    }

    // public static void main(String[] argv) throws IOException { example01(); }
}