package Torello.HTML.Tools.NewsSite;

import Torello.HTML.HTMLNode;
import Torello.HTML.HTMLPage;
import Torello.HTML.HTTPCodes;
import Torello.HTML.URLFilter;
import Torello.HTML.Links;
import Torello.HTML.TagNode;

import Torello.HTML.NodeSearch.InnerTagGet;

import Torello.Java.StrPrint;
import Torello.Java.StrCmpr;
import Torello.Java.StrIndent;

import static Torello.Java.C.RESET;
import static Torello.Java.C.BRED;
import static Torello.Java.C.BYELLOW;

import Torello.Java.Additional.Ret2;
import Torello.Java.Additional.URLs;

import Torello.JavaDoc.Excuse;
import Torello.JavaDoc.StaticFunctional;

import java.util.Vector;
import java.io.IOException;
import java.util.stream.Stream;
import java.util.stream.Collectors;

import java.net.URL;
import java.net.MalformedURLException;
/**
 * Collects all <B>news-article {@code URL's}</B> from a news-oriented web-site's main web-page
 * and from its listed 'sub-section' web-pages.
 * 
 * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPE_URLS>
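 * 
 * <BR /><BR />A minimal sketch of how this class might be invoked - assuming, hypothetically,
 * that {@code myNewsSite} is an already-configured {@link NewsSite} instance, and that log text
 * may simply be printed to {@code System.out} (exception handling elided):
 * 
 * <PRE>{@code
 * // One inner Vector of Article-URL Strings is returned for each Section-URL of the site
 * Vector<Vector<String>> articleURLs = ScrapeURLs.get(myNewsSite, System.out);
 * 
 * for (Vector<String> sectionArticleURLs : articleURLs)
 *     System.out.println("Section has " + sectionArticleURLs.size() + " Article URL's");
 * }</PRE>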
 */
@StaticFunctional(Excused="SKIP_ON_SECTION_URL_EXCEPTION", Excuses=Excuse.CONFIGURATION)
public class ScrapeURLs
{
    private ScrapeURLs() { }

    /**
     * <EMBED CLASS='external-html' DATA-FILE-ID=SU_SKIP_OSUE>
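     *
     * <BR /><BR />For illustration: if a Section-URL that fails to load should abort the entire
     * scrape, rather than simply being skipped (which is the default behavior), this flag may be
     * set to false before invoking {@code get(...)}:
     *
     * <PRE>{@code
     * // A Section-URL that fails to load will now cause a SectionURLException to throw
     * ScrapeURLs.SKIP_ON_SECTION_URL_EXCEPTION = false;
     * }</PRE>
     */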
    public static boolean SKIP_ON_SECTION_URL_EXCEPTION = true;

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #get(Vector, URLFilter, LinksGet, Appendable)}
     */
    public static Vector<Vector<String>> get(NewsSite ns, Appendable log) throws IOException
    { return get(ns.sectionURLsVec(), ns.filter, ns.linksGetter, log); }

055    /**
056     * This class is used to retrieve <B>all</B> of the available article {@code URL} links found
057     * on <B>all sections</B> of a newspaper website.
058     * 
059     * @param sectionURLs       <EMBED CLASS='external-html' DATA-FILE-ID=SU_SEC_URLS>
060     * @param articleURLFilter  <EMBED CLASS='external-html' DATA-FILE-ID=SU_ARTICLE_URLF>
061     * @param linksGetter       <EMBED CLASS='external-html' DATA-FILE-ID=SU_LINKS_GET>
062     * 
063     * @param log This prints log information to the screen.  This parameter may not be null,
064     * or a {@code NullPointerException} will throw.
065     * <EMBED CLASS='external-html' DATA-FILE-ID=APPENDABLE>
066     *
067     * @return                      <EMBED CLASS='external-html' DATA-FILE-ID=SU_RETURNS>
068     * @throws SectionURLException  <EMBED CLASS='external-html' DATA-FILE-ID=SU_SEC_URL_EX>
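     *
     * <BR /><BR />A minimal sketch, assuming {@code sectionURLs} has already been built by the
     * caller.  Passing null for both the {@link URLFilter} and the {@link LinksGet} is explicitly
     * permitted - every {@code '<A HREF=...>'} link found on each section-page is then collected:
     *
     * <PRE>{@code
     * // With a null LinksGet, all Anchor-Tag HREF's on each section-page are collected, and
     * // with a null URLFilter, none of the resolved Article-URL's are filtered out.
     * Vector<Vector<String>> articleURLs =
     *     ScrapeURLs.get(sectionURLs, null, null, System.out);
     * }</PRE>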
     */
    public static Vector<Vector<String>> get(
            Vector<URL> sectionURLs, URLFilter articleURLFilter, LinksGet linksGetter,
            Appendable log
        )
    {
        LOG_WRITE(
            log,
            "\n" + BRED +
            "*****************************************************************************************\n" +
            "*****************************************************************************************\n" +
            RESET + " Finding Article URL's in Newspaper Sections" + BRED + "\n" +
            "*****************************************************************************************\n" +
            "*****************************************************************************************\n" +
            RESET + '\n'
        );

        final Vector<Vector<String>> ret = new Vector<>();

        for (URL sectionURL : sectionURLs)
        {
            Stream<String> urlStream;

            // Requesting a Garbage-Collection run here helps, because scraped web-pages can
            // allocate a lot of String's.
            System.gc();

            // Start scraping this Section for Article-URL's
            LOG_WRITE(log, "Visiting Section URL: " + sectionURL.toString() + '\n');

            try
            {
                // Download, Scrape & Parse the main-page or section URL.
                Vector<HTMLNode> sectionPage = HTMLPage.getPageTokens(sectionURL, false);


                // If the 'LinksGet' instance is null, then select all URL's on the main-page /
                // section-page, and pray for rain (hope for the best).  If no 'LinksGet' instance
                // was provided, there will likely be many spurious / irrelevant links - to
                // non-article pages, and even to advertisement pages - included in this
                // Stream<String>.
                // 
                // InnerTagGet returns a Vector<TagNode>.  Convert that to a Stream<String>, where
                // each 'String' in the 'Stream' is the HREF attribute of an <A HREF=...> tag.

                if (linksGetter == null)
                    urlStream = InnerTagGet.all(sectionPage, "a", "href")
                        .stream().map((TagNode tn) -> tn.AV("href"));

                else
                    urlStream = linksGetter.apply(sectionURL, sectionPage).stream();
            }

            catch (Exception e)
            {
                LOG_WRITE(
                    log,
                    BRED + "Error loading this main-section page-URL\n" + RESET +
                    e.getMessage() + '\n'
                );

                if (SKIP_ON_SECTION_URL_EXCEPTION)
                {
                    LOG_WRITE(log, "Non-fatal Exception, continuing to next Section URL.\n\n");
                    continue;
                }

                else
                {
                    LOG_WRITE(
                        log,
                        BRED + "Fatal - Exiting.  Top-Level Section URL's must be valid URL's." +
                        RESET + "\n" + HTTPCodes.convertMessageVerbose(e, sectionURL, 0) + '\n'
                    );

                    throw new SectionURLException
                        ("Invalid Main Section URL: " + sectionURL.toString(), e);
                }
            }

            Vector<String> sectionArticleURLs = urlStream

                // If any TagNode's did not have HREF-Attributes, remove those null-values
                .filter ((String href) -> (href != null))

                // Perform a Standard String.trim() operation.
                .map    ((String href) -> href.trim())

                // Any HREF's that are "just white-space" are now removed.
                .filter ((String href) -> href.length() > 0)


                // This removes any HREF Attribute values that begin with
                // "mailto:" "tel:" "javascript:" "magnet:" etc...

                .filter ((String href) -> StrCmpr.startsWithNAND_CI(href, Links.NON_URL_HREFS()))

                // Now, Resolve any "Partial URL References"
                .map ((String href) -> Links.resolve_KE(href, sectionURL))


                // If there were any exceptions while performing the Partial-URL Resolve-Operation,
                // then print an error message.

                .peek ((Ret2<URL, MalformedURLException> r2) ->
                {
                    if (r2.b != null) LOG_WRITE(
                        log,
                        "Article Link was a malformed URL, and provided exception message:\n" +
                        r2.b.getMessage() + '\n'
                    );
                })

                // Convert the Ret2<URL, Exception> to just the URL, without any Exceptions
                .map ((Ret2<URL, MalformedURLException> r2) -> r2.a)

                // If there was an exception, the Ret2.a field will be null (remove those nulls)
                .filter ((URL url) -> url != null)


                // NOTE: When this evaluates to TRUE - it should be kept
                // Java Stream's 'filter' method states that when the predicate evaluates to TRUE,
                // the stream element is KEPT / RETAINED.
                // 
                // Class URLFilter mimics the filter behavior of Stream.filter(...)

                .filter ((URL url) -> (articleURLFilter == null) || articleURLFilter.test(url))


                // Convert these to "Standard Strings"
                //      Case-Insensitive parts are set to LowerCase
                //      Case-Sensitive parts are left alone.

                .map ((URL url) -> URLs.urlToString(url))


                // Filter any duplicates -> This is the reason for the above case-sensitive parts
                // being separated.

                .distinct()


                // Verify that each String still parses back into a valid URL.  There really
                // should not be any exceptions; this is just an "extra-careful" step, and is not
                // strictly needed.

                .filter ((String url) ->
                    { try { new URL(url); return true; } catch (Exception e) { return false; } })

                // Convert the Stream to a Vector
                .collect(Collectors.toCollection(Vector::new));

            ret.add(sectionArticleURLs);

            LOG_WRITE(
                log,
                "Found [" + BYELLOW + sectionArticleURLs.size() + RESET + "] " +
                "Article Links.\n\n"
            );
        }


        // Provide a simple count in the log output of how many URL's have been found.
        // NOTE: This count does not check whether different sections contain non-distinct URL's.
        //       (An identical URL found in two different sections will be counted twice!)

        int totalURLs = 0;

        // <?> Prevents the "Xlint:all" from generating warnings...
        for (Vector<?> section : ret) totalURLs += section.size();

        LOG_WRITE(
            log,
            "Complete Possible Article URL list has: " +
            BYELLOW + StrPrint.zeroPad10e4(totalURLs) + RESET + ' ' +
            "URL(s).\n\n"
        );

        return ret;
    }

    // This wrapper is necessary because Java's 'java.lang.Appendable' permits an IOException
    private static void LOG_WRITE(Appendable log, String s)
    {
        try
            { log.append(s); }

        catch (Exception e)
        {
            System.out.println(
                "While trying to write to the log, an exception occurred.\n" +
                "The java.lang.Appendable you have provided threw an IOException:\n" +
                StrIndent.indent(e.getMessage(), 4) + '\n' +
                "Unfortunately, with a faulty Appendable-Log, the scraper cannot continue."
            );

            e.printStackTrace();

            System.out.println("Fatal Error, JVM Exiting...");

            System.exit(1);
        }
    }
}