package Torello.HTML;
import java.io.*;
import java.util.Vector;
import java.net.URL;
import Torello.HTML.HTMLPage.Parser;
import java.util.concurrent.*;
import java.util.concurrent.locks.*;
import Torello.JavaDoc.Excuse;
/**
* A carbon-copy of class {@link HTMLPage}, augmented with a mechanism for setting <B>a timeout</B>
* so that when scraping web-pages and {@code URL's} from servers that might have a tendency to hang,
* freeze, or delay - the Java Virtual Machine can skip and move-on when that timeout expires.
*
* <BR /><BR />
* <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MWT>
* <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE>
*
* @see Scrape#getHTML(BufferedReader, int, int)
* @see Scrape#getHTML(BufferedReader, String, String)
* @see HTMLPage
*/
@Torello.JavaDoc.StaticFunctional(Excused="parser", Excuses=Excuse.SINGLETON)
@Torello.JavaDoc.JDHeaderBackgroundImg
public class HTMLPageMWT
{
    // Static-utility class: no instances are ever created.
    private HTMLPageMWT() { }

    /**
     * If needing to "swap a proprietary parser" comes up, this is possible.
     * It just needs to accept the same parameters as the current parser, and produce a
     * {@code Vector<HTMLNode>.}  This is not an advised step to take, but if an alternative
     * parser has been tested and happens to be generating different results, it can be easily
     * 'swapped out' for the one used now.
     *
     * <BR /><BR />This field is read by the downloading thread at the moment the page is
     * parsed, not at the moment a {@code getPageTokens} method is invoked.
     *
     * @see HTMLPage.Parser
     * @see HTMLPage.Parser#parse(CharSequence, boolean, String, String, String)
     */
    public static Parser parser = ParserRE::parsePageTokens;


    // ********************************************************************************************
    // ********************************************************************************************
    // These 6 functions presume that the HTML source is from a URL
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * Convenience Method.
     * <BR />Accepts: {@code URL} and Time-Out Parameters {@code 'timeout' & 'unit'}
     * <BR />Passes null to parameters
     * {@code startTag, endTag, rawHTMLFile, matchesFile & justTextFile}.
     * <BR />Invokes: {@link #getPageTokens(long, TimeUnit, URL, boolean, String, String,
     *      String, String, String)}
     */
    public static Vector<HTMLNode> getPageTokens
        (long timeout, TimeUnit unit, URL url, boolean eliminateHTMLTags)
        throws IOException, InterruptedException
    {
        return getPageTokens(timeout, unit, url, eliminateHTMLTags, null, null, null, null, null);
    }

    /**
     * Convenience Method.
     * <BR />Accepts: {@code URL} and Time-Out Parameters {@code 'timeout' & 'unit'}
     * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'}
     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
     * <BR />Invokes: {@link #getPageTokens(long, TimeUnit, URL, boolean, String, String,
     *      String, String, String)}
     */
    public static Vector<HTMLNode> getPageTokens(
            long timeout, TimeUnit unit,
            URL url, boolean eliminateHTMLTags,
            String startTag, String endTag
        )
        throws IOException, InterruptedException
    {
        return getPageTokens
            (timeout, unit, url, eliminateHTMLTags, startTag, endTag, null, null, null);
    }

    /**
     * Convenience Method.
     * <BR />Accepts: {@code URL} and Time-Out Parameters {@code 'timeout' & 'unit'}
     * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'}
     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
     * <BR />Invokes: {@link #getPageTokens(long, TimeUnit, URL, boolean, int, int,
     *      String, String, String)}
     */
    public static Vector<HTMLNode> getPageTokens(
            long timeout, TimeUnit unit,
            URL url, boolean eliminateHTMLTags,
            int startLineNum, int endLineNum
        )
        throws IOException, InterruptedException
    {
        return getPageTokens
            (timeout, unit, url, eliminateHTMLTags, startLineNum, endLineNum, null, null, null);
    }

    /**
     * Convenience Method.
     * <BR />Accepts: {@code URL} and Time-Out Parameters {@code 'timeout' & 'unit'}
     * <BR />Passes null to {@code startTag} and {@code endTag} parameters.
     * <BR />Invokes: {@link #getPageTokens(long, TimeUnit, URL, boolean, String, String,
     *      String, String, String)}
     */
    public static Vector<HTMLNode> getPageTokens(
            long timeout, TimeUnit unit,
            URL url, boolean eliminateHTMLTags,
            String rawHTMLFile, String matchesFile, String justTextFile
        )
        throws IOException, InterruptedException
    {
        return getPageTokens(
            timeout, unit, url, eliminateHTMLTags, null, null,
            rawHTMLFile, matchesFile, justTextFile
        );
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // The next 6 functions presume that the input is from a BufferedReader
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * Convenience Method.
     * <BR />Accepts: {@code BufferedReader} and Time-Out Parameters {@code 'timeout' & 'unit'}
     * <BR />Passes null to parameters
     * {@code startTag, endTag, rawHTMLFile, matchesFile & justTextFile}.
     * <BR />Invokes: {@link #getPageTokens(long, TimeUnit, BufferedReader, boolean,
     *      String, String, String, String, String)}
     */
    public static Vector<HTMLNode> getPageTokens
        (long timeout, TimeUnit unit, BufferedReader br, boolean eliminateHTMLTags)
        throws IOException, InterruptedException
    {
        return getPageTokens
            (timeout, unit, br, eliminateHTMLTags, null, null, null, null, null);
    }

    /**
     * Convenience Method.
     * <BR />Accepts: {@code BufferedReader} and Time-Out Parameters {@code 'timeout' & 'unit'}
     * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'}
     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
     * <BR />Invokes: {@link #getPageTokens(long, TimeUnit, BufferedReader, boolean,
     *      String, String, String, String, String)}
     */
    public static Vector<HTMLNode> getPageTokens(
            long timeout, TimeUnit unit,
            BufferedReader br, boolean eliminateHTMLTags, String startTag, String endTag
        )
        throws IOException, InterruptedException
    {
        return getPageTokens
            (timeout, unit, br, eliminateHTMLTags, startTag, endTag, null, null, null);
    }

    /**
     * Convenience Method.
     * <BR />Accepts: {@code BufferedReader} and Time-Out Parameters {@code 'timeout' & 'unit'}
     * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'}
     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
     * <BR />Invokes: {@link #getPageTokens(long, TimeUnit, BufferedReader, boolean,
     *      int, int, String, String, String)}
     */
    public static Vector<HTMLNode> getPageTokens(
            long timeout, TimeUnit unit,
            BufferedReader br, boolean eliminateHTMLTags,
            int startLineNum, int endLineNum
        )
        throws IOException, InterruptedException
    {
        return getPageTokens
            (timeout, unit, br, eliminateHTMLTags, startLineNum, endLineNum, null, null, null);
    }

    /**
     * Convenience Method.
     * <BR />Accepts: {@code BufferedReader} and Time-Out Parameters {@code 'timeout' & 'unit'}
     * <BR />Passes null to {@code startTag} and {@code endTag} parameters.
     * <BR />Invokes: {@link #getPageTokens(long, TimeUnit, BufferedReader, boolean,
     *      String, String, String, String, String)}
     */
    public static Vector<HTMLNode> getPageTokens(
            long timeout, TimeUnit unit,
            BufferedReader br, boolean eliminateHTMLTags,
            String rawHTMLFile, String matchesFile, String justTextFile
        )
        throws IOException, InterruptedException
    {
        return getPageTokens
            (timeout, unit, br, eliminateHTMLTags, null, null, rawHTMLFile, matchesFile, justTextFile);
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // Shared "Max Wait Time" machinery: thread-pool, lock and the submit / wait / unwrap helper
    // ********************************************************************************************
    // ********************************************************************************************


    // Pool on which every download-and-parse task is run.
    private static final ExecutorService executor = Executors.newCachedThreadPool();

    // Held only for the duration of the call to executor.submit(...)
    private static final Lock lock = new ReentrantLock();

    /**
     * If this class has been used to make "multi-threaded" calls that use a Time-Out wait-period,
     * you might see your Java-Program hang for a few seconds when you would expect it to exit back
     * to your O.S. normally.
     *
     * <BR /><BR /><B>Max Wait Time</B> operates by building a "Timeout &amp; Monitor" thread, and
     * therefore when a program you have written yourself reaches the end of its code, <I><B>if you
     * have performed any Internet-Downloads using {@code class HTMLPageMWT}</B></I>, then your
     * program <I>might not exit immediately,</I> but rather sit at the command-prompt for anywhere
     * between 10 and 30 seconds before this Timeout-Thread, created in class HTMLPageMWT, dies.
     *
     * <BR /><BR /><B CLASS=JDDescLabel>Multi-Threaded:</B>
     *
     * <BR />You may also immediately terminate any additional threads that were started by using
     * this method.
     *
     * <BR /><BR />This invokes {@code shutdownNow()} on the internal {@code ExecutorService}.
     * Once that executor has been shut down it rejects new tasks, so any later invocation of a
     * {@code getPageTokens} method in this class throws {@code RejectedExecutionException}.
     */
    public static void shutdownMWTThreads() { executor.shutdownNow(); }

    /**
     * Submits a download-and-parse task to the thread-pool, waits at most the specified amount
     * of time for it to finish, and translates any failure into the exceptions that the public
     * {@code getPageTokens} methods are declared to throw.
     *
     * @param timeout The maximum time to wait for {@code 'task'} to complete
     * @param unit The time-unit of the {@code 'timeout'} parameter
     * @param task The download-and-parse job to run on a pool thread
     *
     * @return The result produced by {@code 'task'}, or null if the wait timed out.
     *
     * @throws NullPointerException If {@code 'unit'} is null.  This is checked <I>before</I> the
     * task is submitted, so that no un-monitored download is ever started.
     * @throws IOException If {@code 'task'} failed with an {@code IOException}
     * @throws InterruptedException If the calling thread was interrupted while waiting
     * @throws RejectedExecutionException If the executor refuses the task (for instance after
     * {@link #shutdownMWTThreads()} was called), or if {@code 'task'} failed with a throwable
     * that is neither an {@code IOException} nor a {@code RuntimeException}
     */
    private static Vector<HTMLNode> awaitTokens
        (long timeout, TimeUnit unit, Callable<Vector<HTMLNode>> task)
        throws IOException, InterruptedException
    {
        // Future.get(long, TimeUnit) would raise this same exception, but only after the task
        // was already running.  Fail before anything is submitted instead.
        if (unit == null) throw new NullPointerException("Parameter 'unit' is null.");

        final Future<Vector<HTMLNode>> future;

        // submit(...) throws RejectedExecutionException once the executor is shut down.  The
        // 'finally' guarantees the lock is released in that case - otherwise every later caller
        // would block forever inside lock.lock()
        lock.lock();
        try
            { future = executor.submit(task); }
        finally
            { lock.unlock(); }

        try
            { return future.get(timeout, unit); }

        catch (TimeoutException e)
        {
            // Time-limit exceeded: ask the pool thread to stop rather than leaving the
            // abandoned download running in the background.
            future.cancel(true);
            return null;
        }

        catch (InterruptedException e)
        {
            // Nobody is waiting for the result any longer, so stop the task as well.
            future.cancel(true);
            throw e;
        }

        catch (ExecutionException e)
        {
            Throwable originalException = e.getCause();

            if (originalException == null) throw new RejectedExecutionException(
                "An Execution Exception was thrown, but it did not provide a cause throwable " +
                "(e.getCause() returned null).  See this exception's getCause() method to " +
                "view the ExecutionException that has occurred.",
                e
            );

            // Re-throw the task's own failure, un-wrapped, whenever its type permits
            if (originalException instanceof IOException)
                throw (IOException) originalException;

            if (originalException instanceof RuntimeException)
                throw (RuntimeException) originalException;

            throw new RejectedExecutionException(
                "An Execution Exception occurred, but it was neither a RuntimeException, " +
                "nor IOException.  See this exception's getCause() method to view the " +
                "underlying error that has occurred.", originalException
            );
        }
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // Receives a "pre-instantiated" BufferedReader for the HTML Source parameter
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * Parses and Vectorizes HTML from a {@code BufferedReader} source.
     * Spawns a <I>monitor-thread</I> that stops the download if a
     * certain, user-specified, time-limit is exceeded.  When the time-limit expires, the
     * downloading task is cancelled and this method returns null.
     *
     * @param timeout <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_TIMEOUT>
     * @param unit <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_UNIT>
     * @param br <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_BR>
     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
     * @param startTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_START_TAG>
     * @param endTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_END_TAG>
     * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
     * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
     * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
     * @throws ScrapeException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_SCEX2>
     * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
     * @throws InterruptedException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IEX>
     * @throws RejectedExecutionException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_REEX>
     */
    public static Vector<HTMLNode> getPageTokens(
            long timeout, TimeUnit unit,
            BufferedReader br, boolean eliminateHTMLTags,
            String startTag, String endTag,
            String rawHTMLFile, String matchesFile, String justTextFile
        )
        throws IOException, InterruptedException
    {
        return awaitTokens(
            timeout, unit,
            () -> parser.parse(
                Scrape.getHTML(br, startTag, endTag),
                eliminateHTMLTags, rawHTMLFile, matchesFile, justTextFile
            )
        );
    }

    /**
     * Parses and Vectorizes HTML from a {@code BufferedReader} source.
     * Spawns a <I>monitor-thread</I> that stops the download if a
     * certain, user-specified, time-limit is exceeded.  When the time-limit expires, the
     * downloading task is cancelled and this method returns null.
     *
     * @param timeout <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_TIMEOUT>
     * @param unit <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_UNIT>
     * @param br <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_BR>
     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
     * @param startLineNum <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_START_LN>
     * @param endLineNum <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_END_LN>
     * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
     * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
     * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
     * @throws IllegalArgumentException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IAEX>
     * @throws ScrapeException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_SCEX1>
     * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
     * @throws InterruptedException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IEX>
     * @throws RejectedExecutionException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_REEX>
     */
    public static Vector<HTMLNode> getPageTokens(
            long timeout, TimeUnit unit,
            BufferedReader br, boolean eliminateHTMLTags,
            int startLineNum, int endLineNum,
            String rawHTMLFile, String matchesFile, String justTextFile
        )
        throws IOException, InterruptedException
    {
        return awaitTokens(
            timeout, unit,
            () -> parser.parse(
                Scrape.getHTML(br, startLineNum, endLineNum),
                eliminateHTMLTags, rawHTMLFile, matchesFile, justTextFile
            )
        );
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // Receives a java.net.URL for the HTML Source parameter -- that could Timeout/Hang
    // ********************************************************************************************
    // ********************************************************************************************


    // The URL must be opened within the Multi-Threaded "Timeout" code, and therefore requires a
    // second version of these two methods - where Scrape.openConn(url) is *inside* the monitored
    // downloading thread.

    /**
     * Parses and Vectorizes HTML from a URL source.
     * Spawns a <I>monitor-thread</I> that stops the download if a certain, user-specified,
     * time-limit is exceeded.  When the time-limit expires, the downloading task is cancelled
     * and this method returns null.
     *
     * @param timeout <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_TIMEOUT>
     * @param unit <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_UNIT>
     * @param url <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_URL>
     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
     * @param startTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_START_TAG>
     * @param endTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_END_TAG>
     * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
     * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
     * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
     * @throws ScrapeException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_SCEX2>
     * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
     * @throws InterruptedException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IEX>
     * @throws RejectedExecutionException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_REEX>
     */
    public static Vector<HTMLNode> getPageTokens(
            long timeout, TimeUnit unit,
            URL url, boolean eliminateHTMLTags,
            String startTag, String endTag,
            String rawHTMLFile, String matchesFile, String justTextFile
        )
        throws IOException, InterruptedException
    {
        return awaitTokens(
            timeout, unit,
            () -> parser.parse(
                Scrape.getHTML(Scrape.openConn(url), startTag, endTag),
                eliminateHTMLTags, rawHTMLFile, matchesFile, justTextFile
            )
        );
    }

    /**
     * Parses and Vectorizes HTML from a URL source.
     * Spawns a <I>monitor-thread</I> that stops the download if a certain, user-specified,
     * time-limit is exceeded.  When the time-limit expires, the downloading task is cancelled
     * and this method returns null.
     *
     * @param timeout <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_TIMEOUT>
     * @param unit <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_UNIT>
     * @param url <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_URL>
     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
     * @param startLineNum <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_START_LN>
     * @param endLineNum <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_END_LN>
     * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
     * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
     * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
     * @throws IllegalArgumentException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IAEX>
     * @throws ScrapeException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_SCEX1>
     * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
     * @throws InterruptedException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IEX>
     * @throws RejectedExecutionException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_REEX>
     */
    public static Vector<HTMLNode> getPageTokens(
            long timeout, TimeUnit unit,
            URL url, boolean eliminateHTMLTags,
            int startLineNum, int endLineNum,
            String rawHTMLFile, String matchesFile, String justTextFile
        )
        throws IOException, InterruptedException
    {
        return awaitTokens(
            timeout, unit,
            () -> parser.parse(
                Scrape.getHTML(Scrape.openConn(url), startLineNum, endLineNum),
                eliminateHTMLTags, rawHTMLFile, matchesFile, justTextFile
            )
        );
    }
}