package Torello.HTML;
import Torello.HTML.*;
import Torello.Java.FileRW;
import java.util.*;
import java.io.*;
import java.net.*;
/**
 * Demonstrates using 'Splash,' which is one of many ways to execute the Java-Script on
 * Web-Pages, before those pages are parsed.
 *
 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=SPLASH_BRIDGE>
 */
public class SplashBridge
{
    // Static-only demonstration class; never instantiated.
    private SplashBridge() { }

    /**
     * Once the {@code Splash HTTP Server} is running (which requires the {@code Docker} loading
     * and installation tool), all one has to do is <I><B>prepend this {@code String}</B></I> to
     * any {@code URL}, and the {@code Splash Script Executor} will be invoked on the HTML and
     * Script that is received from that {@code URL}.
     *
     * <DIV CLASS="EXAMPLE">{@code
     * String myURL = "https://cars.com";
     * URL withSplashServerURL = new URL(SplashBridge.SPLASH_URL + myURL);
     *
     * // Here, just use the standard HTML scrape and parsing routines to retrieve the HTML
     * // from the URL 'myURL'.  Splash will execute any 'dynamic HTML' that is loaded via the
     * // standard script libraries like AJAX, JSON, React-JS, jQuery, or Angular.
     *
     * Vector<HTMLNode> html = HTMLPage.getPageTokens(withSplashServerURL, false);
     *
     * // NOTE: The above invocation will not call the "www.cars.com" server, BUT RATHER, will
     * //       ask the HTTP Server running on the local host as a PROXY to retrieve the HTML
     * //       from "www.cars.com".  Before returning that HTML, the local proxy server will also
     * //       execute the dynamic-loading script that is present on the main page of "cars.com"
     * //
     * // ALSO: There are other libraries that perform this type of work: Selenium, and Android
     * //       class WebView.
     * }</DIV>
     *
     * <BR /><B>IMPORTANT:</B> This {@code String} ends with the query-parameter {@code "url="}.
     * If the {@code URL} being appended contains a query-string of its own (any {@code '?'} or
     * {@code '&'} characters), it must first be encoded, for instance with
     * {@code java.net.URLEncoder.encode(myURL, "UTF-8")}.  Otherwise every parameter after the
     * first {@code '&'} is read as a parameter of the {@code render.html} request itself, rather
     * than as a part of the {@code URL} that is to be rendered.
     */
    public static final String SPLASH_URL = "http://localhost:8050/render.html?url=";

    /**
     * <EMBED CLASS='external-html' DATA-FILE-ID=SPLASH_DOCKER>
     *
     * <BR /><BR />Downloads the same Wikipedia page twice - once through the Splash server at
     * {@link #SPLASH_URL}, and once directly - and writes the two results to the files
     * {@code "cc.html"} and {@code "cc2.html"} in the current working directory, so that the
     * two may be compared.
     *
     * @throws IOException If there are any {@code HTTP} errors when downloading or processing
     * the HTML.
     */
    public static void example01() throws IOException
    {
        // The page being retrieved.  It is declared once, and used for both downloads, so that
        // the two downloads are guaranteed to request the exact same page.
        String pageUrlStr =
            "https://en.wikipedia.org/w/index.php?title=Christopher_Columbus&oldid=924321156";

        // Call the splash-bridge running on local-host @ port 8050
        // The "wait" parameter means it will wait up to four seconds to run java-script AJAX
        // data-retrieval tasks that are on the page.
        //
        // The page URL has a query-string of its own, and must therefore be encoded.  If it
        // were not, its "&oldid=..." parameter would be handed to 'render.html' as a separate
        // parameter, instead of being a part of the URL that is rendered.
        String splashUrlStr =
            SPLASH_URL + URLEncoder.encode(pageUrlStr, "UTF-8") + "&timeout=10&wait=4.0";

        URL splashUrl = new URL(splashUrlStr);

        // This will just use the standard Java HTTP URLConnection class to connect to the exact
        // same page.
        URL directUrl = new URL(pageUrlStr);

        // Download both versions.  This version is contacting a Splash Server on a local host
        // running @ port 8050
        // NOTE: This writes the HTML to a Flat-File on the File-System.
        Vector<HTMLNode> splashPage = HTMLPage.getPageTokens(splashUrl, false);
        FileRW.writeFile(Util.pageToString(splashPage), "cc.html");

        // This version is contacting Wikipedia.com, and ignoring any possible AJAX or Java-Script
        // calls - script calls of any kind are being ignored by this version.
        // NOTE: This writes the HTML to a Flat-File on the File-System.
        Vector<HTMLNode> directPage = HTMLPage.getPageTokens(directUrl, false);
        FileRW.writeFile(Util.pageToString(directPage), "cc2.html");

        // File sizes recorded from an earlier run of this example:
        // FileOutput Size: Version 1: 650737 Nov 4 18:28 cc.html
        // FileOutput Size: Version 2: 493879 Nov 4 18:28 cc2.html
        // RESULTS: Clearly there is quite a bit of downloaded data from AJAX & Splash
    }

    // public static void main(String[] argv) throws IOException { example01(); }
}
|