Source code

001package Torello.HTML;
002
003import java.io.*;
004import java.util.Vector;
005import java.net.URL;
006
007import Torello.JavaDoc.Excuse;
008import Torello.Java.UnreachableError;
009
010/**
011 * Java HTML's flagship-parser class for converting HTML web-pages into plain Java {@code Vector's}
012 * of {@link HTMLNode}.
013 * 
014 * <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE>
015 * 
016 * @see Scrape#getHTML(BufferedReader, int, int)
017 * @see Scrape#getHTML(BufferedReader, String, String)
018 * @see HTMLPageMWT
019 */
020@Torello.JavaDoc.StaticFunctional(Excused="parser", Excuses=Excuse.SINGLETON)
021@Torello.JavaDoc.JDHeaderBackgroundImg
022public class HTMLPage
023{
024    private HTMLPage() { }
025
026    /**
027     * A function-pointer / lambda-target that (could) potentially be used to replace this
028     * library's current regular-expression based parser with something possibly faster or even
029     * more efficient.
030     * 
031     * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_PARSER>
032     * @see #parser
033     */
034    @FunctionalInterface
035    public static interface Parser
036    {
037        /**
038         * Parse html source-text into a {@code Vector<HTMLNode>}.
039         * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_HTML>
040         * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
041         * 
042         * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
043         * 
044         * <BR /><BR /><DIV CLASS=JDHint> If you have decided to implement a parser, and you wish
045         * to ingore this parameter (and don't want to output such a file) - it is (hopefully)
046         * obvious that you may skip this step!</DIV>
047         * 
048         * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
049         * <BR /><BR /><DIV CLASS=JDHint><B>As above,</B> you may skip implementing this.</DIV>
050         * 
051         * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
052         * <BR /><BR /><DIV CLASS=JDHint><B>As above,</B> you may skip implementing this.</DIV>
053         * 
054         * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
055         * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
056         */
057        public Vector<HTMLNode> parse(
058                CharSequence    html,
059                boolean         eliminateHTMLTags,
060                String          rawHTMLFile,
061                String          matchesFile,
062                String          justTextFile
063            )
064        throws IOException;
065    }
066
    /**
     * The {@link Parser} to which every {@code getPageTokens} overload in this class ultimately
     * hands its HTML.  It is initialized to {@code ParserRE::parsePageTokens}.
     * 
     * <BR /><BR />If needing to "swap a proprietary parser" comes up, this is possible: assign
     * the replacement to this field.  It just needs to accept the same parameters as the current
     * parser, and produce a {@code Vector<HTMLNode>}.  This is not an advised step to take, but
     * if an alternative parser has been tested and happens to be generating different results,
     * it can be easily 'swapped out' for the one used now.
     * 
     * <BR /><BR />NOTE(review): this is a plain, non-{@code volatile}, public static field and no
     * synchronization is visible in this class - confirm whether re-assigning it while other
     * threads are parsing needs to be guarded.
     * 
     * @see Parser
     * @see Parser#parse
     */
    public static Parser parser = ParserRE::parsePageTokens;
077
078
079    // ********************************************************************************************
080    // ********************************************************************************************
081    // These 6 functions presume that the HTML source needs to be downloaded & read from a URL
082    // ********************************************************************************************
083    // ********************************************************************************************
084
085
086    /**
087     * Convenience Method.
088     * <BR />Accepts: {@code URL}
089     * <BR />Passes null to parameters
090     * {@code startTag, endTag, rawHTMLFile, matchesFile & justTextFile}.
091     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
092     *      String, String, String, String, String)}
093     * <BR />And Invokes: {@link Scrape#openConn(URL)}
094     */
095    public static Vector<HTMLNode> getPageTokens
096        (URL url, boolean eliminateHTMLTags)
097        throws IOException
098    {
099        return getPageTokens
100            (Scrape.openConn(url), eliminateHTMLTags, null, null, null, null, null);
101    }
102    
103    /**
104     * Convenience Method.
105     * <BR />Accepts: {@code URL}
106     * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'}
107     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
108     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
109     *      String, String, String, String, String)}
110     * <BR />And Invokes: {@link Scrape#openConn(URL)}
111     */       
112    public static Vector<HTMLNode> getPageTokens
113        (URL url, boolean eliminateHTMLTags, String startTag, String endTag)
114        throws IOException
115    {
116        return getPageTokens
117            (Scrape.openConn(url), eliminateHTMLTags, startTag, endTag, null, null, null);
118    }
119    
120    /**
121     * Convenience Method.
122     * <BR />Accepts: {@code URL}
123     * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'}
124     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
125     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
126     *      int, int, String, String, String)}
127     * <BR />And Invokes: {@link Scrape#openConn(URL)}
128     */
129    public static Vector<HTMLNode> getPageTokens
130        (URL url, boolean eliminateHTMLTags, int startLineNum, int endLineNum)
131        throws IOException
132    {
133        return getPageTokens
134            (Scrape.openConn(url), eliminateHTMLTags, startLineNum, endLineNum, null, null, null);
135    }
136
137    /**
138     * Convenience Method.
139     * <BR />Accepts: {@code URL}
140     * <BR />Passes null to {@code startTag} &amp; {@code endTag} parameters.
141     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
142     *      String, String, String, String, String)}
143     * <BR />And Invokes: {@link Scrape#openConn(URL)}
144     */
145    public static Vector<HTMLNode> getPageTokens(
146            URL url, boolean eliminateHTMLTags,
147            String rawHTMLFile, String matchesFile, String justTextFile
148        )
149        throws IOException
150    {
151        return getPageTokens(
152            Scrape.openConn(url), eliminateHTMLTags,
153            null, null,
154            rawHTMLFile, matchesFile, justTextFile
155        );
156    }
157    
158    /**
159     * Convenience Method.
160     * <BR />Accepts: {@code URL}
161     * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'}
162     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
163     *      String, String, String, String, String)}
164     * <BR />And Invokes: {@link Scrape#openConn(URL)}
165     */
166    public static Vector<HTMLNode> getPageTokens(
167            URL url, boolean eliminateHTMLTags,
168            String startTag, String endTag,
169            String rawHTMLFile, String matchesFile, String justTextFile
170        )
171        throws IOException
172    {
173        return getPageTokens(
174            Scrape.openConn(url), eliminateHTMLTags,
175            startTag, endTag,
176            rawHTMLFile, matchesFile, justTextFile
177        );
178    }
179    
180    /**
181     * Convenience Method.
182     * <BR />Accepts: {@code URL}
183     * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'}
184     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
185     *      int, int, String, String, String)}
186     * <BR />And Invokes: {@link Scrape#openConn(URL)}
187     */
188    public static Vector<HTMLNode> getPageTokens(
189            URL url, boolean eliminateHTMLTags,
190            int startLineNum, int endLineNum,
191            String rawHTMLFile, String matchesFile, String justTextFile
192        )
193        throws IOException
194    {
195        return getPageTokens(
196            Scrape.openConn(url), eliminateHTMLTags,
197            startLineNum, endLineNum,
198            rawHTMLFile, matchesFile, justTextFile
199        );
200    }
201
202
203    // ********************************************************************************************
204    // ********************************************************************************************
205    // These 6 functions presume that the HTML source is from a CharSequence
206    // ********************************************************************************************
207    // ********************************************************************************************
208
209
210    /**
211     * Parses and Vectorizes HTML from a {@code CharSequence} (usually a {@code String}) source.
212     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_HTML>
213     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
214     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
215     * <BR /><BR /><B><SPAN STYLE="color: red;">NOTE:</B></SPAN> This method does not throw any
216     * checked-exceptions, there is no Input-Output involved here, it is strictly a computational
217     * method that neither invokes the file-system, nor the web.
218     */
219    public static Vector<HTMLNode> getPageTokens
220        (CharSequence html, boolean eliminateHTMLTags)
221        // NO IOException... NO I/O!
222    {
223        try
224            { return parser.parse(html, eliminateHTMLTags, null, null, null); }
225
226        // This should never happen, when reading from a 'String' rather than a URL, or
227        // BufferedReader ==> IOException will not be thrown.
228
229        catch (IOException ioe)
230            { throw new UnreachableError(ioe); }
231    }
232
233    /**
234     * Convenience Method.
235     * <BR />Accepts: {@code CharSequence}
236     * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'}
237     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
238     * <BR />Invokes: {@link #getPageTokens(CharSequence, boolean,
239     *      String, String, String, String, String)}
240     * <BR />Catches: {@code IOException} <B>{@code ==>}</B> No HTTP-I/O, so an IOException isn't
241     * possible!
242     */
243    public static Vector<HTMLNode> getPageTokens
244        (CharSequence html, boolean eliminateHTMLTags, String startTag, String endTag)
245    // NO IOException... NO I/O!
246    {
247        try
248            { return getPageTokens(html, eliminateHTMLTags, startTag, endTag, null, null, null); }
249
250        // This should never happen, when reading from a 'String' rather than a URL, or
251        // BufferedReader ==> IOException will not be thrown.
252
253        catch (IOException ioe)
254            { throw new UnreachableError(ioe); }
255    }
256    
257    /**
258     * Convenience Method.
259     * <BR />Accepts: {@code CharSequence}
260     * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'}
261     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
262     * <BR />Invokes: {@link #getPageTokens(CharSequence, boolean,
263     *      int, int, String, String, String)}
264     * <BR />Catches: {@code IOException} <B>{@code ==>}</B> No HTTP-I/O, so an IOException isn't
265     * possible!
266     */
267    public static Vector<HTMLNode> getPageTokens
268        (CharSequence html, boolean eliminateHTMLTags, int startLineNum, int endLineNum)
269        // NO IOException... NO I/O!
270    {
271        try
272        { 
273            return getPageTokens
274                (html, eliminateHTMLTags, startLineNum, endLineNum, null, null, null);
275        }
276
277        // This should never happen, when reading from a 'String' rather than a URL, or
278        // BufferedReader ==> IOException will not be thrown.
279
280        catch (IOException ioe)
281            { throw new UnreachableError(ioe); }
282    }
283
284    /**
285     * Parses and Vectorizes HTML from a {@code CharSequence} (usually a {@code String}) source.
286     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_HTML>
287     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
288     * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
289     * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
290     * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
291     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
292     * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
293     */
294    public static Vector<HTMLNode> getPageTokens(
295            CharSequence html, boolean eliminateHTMLTags,
296            String rawHTMLFile, String matchesFile, String justTextFile
297        )
298        throws IOException
299    { return parser.parse(html, eliminateHTMLTags, rawHTMLFile, matchesFile, justTextFile); }
300
301    /**
302     * Parses and Vectorizes HTML from a {@code CharSequence} (usually a {@code String}) source.
303     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_HTML>
304     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
305     * @param startTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_START_TAG>
306     * @param endTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_END_TAG>
307     * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
308     * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
309     * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
310     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
311     * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
312     * @throws ScrapeException  <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_SCEX2>
313     */
314    public static Vector<HTMLNode> getPageTokens(
315            CharSequence html, boolean eliminateHTMLTags,
316            String startTag, String endTag,
317            String rawHTMLFile, String matchesFile, String justTextFile
318        )
319        throws IOException
320    {
321        String  htmlStr = html.toString();
322
323        int sPos = htmlStr.indexOf(startTag);
324
325        if (sPos == -1) throw new IllegalArgumentException
326            ("Passed String-Parameter 'startTag' [" + startTag + "] was not found in HTML.");
327
328        int ePos = htmlStr.indexOf(endTag, sPos);
329
330        if (ePos == -1) throw new IllegalArgumentException
331            ("Passed String-Parameter 'endTag' [" + endTag + "] was not found in HTML.");
332
333        ePos += endTag.length();
334
335        return parser.parse(
336            htmlStr.substring(sPos, ePos), eliminateHTMLTags,
337            rawHTMLFile, matchesFile, justTextFile
338        );
339    }
340    
341    /**
342     * Convenience Method.
343     * <BR />Accepts: {@code CharSequence}
344     * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'}
345     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
346     *      int, int, String, String, String)}
347     */
348    public static Vector<HTMLNode> getPageTokens(
349            CharSequence html, boolean eliminateHTMLTags,
350            int startLineNum, int endLineNum,
351            String rawHTMLFile, String matchesFile, String justTextFile
352        ) 
353        throws IOException
354    {
355        return getPageTokens(
356            new BufferedReader(new StringReader(html.toString())),
357            eliminateHTMLTags, startLineNum, endLineNum, rawHTMLFile, matchesFile, justTextFile
358        );
359    }
360
361
362    // ********************************************************************************************
363    // ********************************************************************************************
364    // The next 6 functions presume that the input is from a BufferedReader
365    // ********************************************************************************************
366    // ********************************************************************************************
367
368
369    /**
370     * Convenience Method.
371     * <BR />Accepts: {@code BufferedReader}
372     * <BR />Passes null to parameters
373     * {@code startTag, endTag, rawHTMLFile, matchesFile & justTextFile}.
374     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
375     *      String, String, String, String, String)}
376     */
377    public static Vector<HTMLNode> getPageTokens
378        (BufferedReader br, boolean eliminateHTMLTags)
379        throws IOException
380    { return getPageTokens(br, eliminateHTMLTags, null, null, null, null, null); }
381
382    /**
383     * Convenience Method.
384     * <BR />Accepts: {@code BufferedReader}
385     * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'}
386     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
387     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
388     *      String, String, String, String, String)}
389     */ 
390    public static Vector<HTMLNode> getPageTokens
391        (BufferedReader br, boolean eliminateHTMLTags, String startTag, String endTag)
392        throws IOException
393    { return getPageTokens(br, eliminateHTMLTags, startTag, endTag, null, null, null); }
394
395    /**
396     * Convenience Method.
397     * <BR />Accepts: {@code BufferedReader}
398     * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'}
399     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
400     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
401     *      int, int, String, String, String)}
402     */
403    public static Vector<HTMLNode> getPageTokens
404        (BufferedReader br, boolean eliminateHTMLTags, int startLineNum, int endLineNum)
405        throws IOException
406    { return getPageTokens(br, eliminateHTMLTags, startLineNum, endLineNum, null, null, null); }
407
408
409    /**
410     * Convenience Method.
411     * <BR />Accepts: {@code BufferedReader}
412     * <BR />Passes null to {@code startTag} &amp; {@code endTag} parameters.
413     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
414     *      String, String, String, String, String)}
415     */
416    public static Vector<HTMLNode> getPageTokens(
417            BufferedReader br, boolean eliminateHTMLTags,
418            String rawHTMLFile, String matchesFile, String justTextFile
419        )
420        throws IOException
421    {
422        return getPageTokens
423            (br, eliminateHTMLTags, null, null, rawHTMLFile, matchesFile, justTextFile);
424    }
425
426
427    // ********************************************************************************************
428    // ********************************************************************************************
429    // 
430    // ********************************************************************************************
431    // ********************************************************************************************
432
433
434    /**
435     * Parses and Vectorizes HTML from a {@code BufferedReader} source.
436     * @param br <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_BR>
437     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
438     * @param startTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_START_TAG>
439     * @param endTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_END_TAG>
440     * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
441     * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
442     * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
443     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
444     * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
445     * @throws ScrapeException  <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_SCEX2>
446     */
447    public static Vector<HTMLNode> getPageTokens(
448            BufferedReader br, boolean eliminateHTMLTags,
449            String startTag, String endTag,
450            String rawHTMLFile, String matchesFile, String justTextFile
451        )
452        throws IOException
453    {
454        return parser.parse(
455            Scrape.getHTML(br, startTag, endTag), eliminateHTMLTags, rawHTMLFile,
456            matchesFile, justTextFile
457        );
458    }
459
460    /**
461     * Parses and Vectorizes HTML from a {@code BufferedReader} source.
462     * @param br <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_BR>
463     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
464     * @param startLineNum <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_START_LN>
465     * @param endLineNum <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_END_LN>
466     * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
467     * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
468     * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
469     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
470     * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
471     * @throws IllegalArgumentException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IAEX>
472     * @throws ScrapeException  <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_SCEX1>
473     */
474    public static Vector<HTMLNode> getPageTokens(
475            BufferedReader br, boolean eliminateHTMLTags,
476            int startLineNum, int endLineNum,
477            String rawHTMLFile, String matchesFile, String justTextFile
478        )
479        throws IOException
480    {
481        return parser.parse(
482            Scrape.getHTML(br, startLineNum, endLineNum), eliminateHTMLTags,
483            rawHTMLFile, matchesFile, justTextFile
484        );
485    }
486}