package Torello.HTML.Tools.NewsSite;

import Torello.HTML.HTMLNode;
import Torello.HTML.HTMLPage;
import Torello.HTML.HTTPCodes;
import Torello.HTML.URLFilter;
import Torello.HTML.Links;
import Torello.HTML.TagNode;

import Torello.HTML.NodeSearch.InnerTagGet;

import Torello.Java.StrPrint;
import Torello.Java.StrCmpr;
import Torello.Java.StrIndent;

import static Torello.Java.C.RESET;
import static Torello.Java.C.BRED;
import static Torello.Java.C.BYELLOW;

import Torello.Java.Additional.Ret2;
import Torello.Java.Additional.URLs;

import Torello.JavaDoc.Excuse;
import Torello.JavaDoc.StaticFunctional;

import java.util.Vector;
import java.io.IOException;
import java.util.stream.Stream;
import java.util.stream.Collectors;

import java.net.URL;
import java.net.MalformedURLException;

/**
 * Collects all <B>news-article {@code URL's}</B> from a news-oriented web-site's main web-page
 * and from its list of 'sub-section' web-pages.
 *
 * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPE_URLS>
 */
@StaticFunctional(Excused="SKIP_ON_SECTION_URL_EXCEPTION", Excuses=Excuse.CONFIGURATION)
public class ScrapeURLs
{
    private ScrapeURLs() { }

    /** <EMBED CLASS='external-html' DATA-FILE-ID=SU_SKIP_OSUE> */
    public static boolean SKIP_ON_SECTION_URL_EXCEPTION = true;

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #get(Vector, URLFilter, LinksGet, Appendable)}
     */
    public static Vector<Vector<String>> get(NewsSite ns, Appendable log) throws IOException
    { return get(ns.sectionURLsVec(), ns.filter, ns.linksGetter, log); }
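
    /*
     * Illustrative sketch (not part of the original library source).  The four-argument
     * get(...) below accepts a 'URLFilter' and a 'LinksGet'.  Judging only from how those
     * parameters are used inside this class (articleURLFilter.test(URL), and
     * linksGetter.apply(URL, Vector<HTMLNode>)), a hypothetical invocation might look like:
     *
     *     Vector<URL> sections = ...;    // the section-page URL's to visit
     *
     *     // Example heuristic only: keep links whose path looks like an article link
     *     URLFilter filter = (URL u) -> u.getPath().contains("/article/");
     *
     *     Vector<Vector<String>> articleURLs =
     *         ScrapeURLs.get(sections, filter, null, System.out);
     *
     * Passing null for the 'LinksGet' simply selects every <A HREF=...> link on each section
     * page, as the code below shows.  Whether URLFilter may be written as a lambda depends on
     * its declaration in package Torello.HTML; treat this snippet as a sketch rather than
     * verified, working code.
     */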

    /**
     * This method retrieves <B>all</B> of the available article {@code URL} links found
     * on <B>all sections</B> of a newspaper website.
     *
     * @param sectionURLs <EMBED CLASS='external-html' DATA-FILE-ID=SU_SEC_URLS>
     * @param articleURLFilter <EMBED CLASS='external-html' DATA-FILE-ID=SU_ARTICLE_URLF>
     * @param linksGetter <EMBED CLASS='external-html' DATA-FILE-ID=SU_LINKS_GET>
     *
     * @param log Log information is written to this {@code Appendable}.  This parameter may not
     * be null, or a {@code NullPointerException} will be thrown.
     * <EMBED CLASS='external-html' DATA-FILE-ID=APPENDABLE>
     *
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=SU_RETURNS>
     * @throws SectionURLException <EMBED CLASS='external-html' DATA-FILE-ID=SU_SEC_URL_EX>
     */
    public static Vector<Vector<String>> get(
        Vector<URL> sectionURLs, URLFilter articleURLFilter, LinksGet linksGetter,
        Appendable log
    )
    {
        LOG_WRITE(
            log,
            "\n" + BRED +
            "*****************************************************************************************\n" +
            "*****************************************************************************************\n" +
            RESET + " Finding Article URL's in Newspaper Sections" + BRED + "\n" +
            "*****************************************************************************************\n" +
            "*****************************************************************************************\n" +
            RESET + '\n'
        );

        final Vector<Vector<String>> ret = new Vector<>();

        for (URL sectionURL : sectionURLs)
        {
            Stream<String> urlStream;

            // Request a garbage-collection pass; parsed web-pages can allocate a lot of Strings.
            System.gc();

            // Begin scraping this section for URL's
            LOG_WRITE(log, "Visiting Section URL: " + sectionURL.toString() + '\n');

            try
            {
                // Download, scrape & parse the main-page or section URL.
                Vector<HTMLNode> sectionPage = HTMLPage.getPageTokens(sectionURL, false);


                // If the 'LinksGet' instance is null, then select all URL's on the main-page /
                // section-page, and hope for the best.  If no 'LinksGet' instance was provided,
                // there will likely be many spurious / irrelevant links to non-article pages,
                // and even advertisement pages, included in this Stream<String>.
                //
                // InnerTagGet returns a Vector<TagNode>.  Convert that to a Stream<String>, where
                // each 'String' in the 'Stream' is the HREF attribute of an <A HREF=...> tag.

                if (linksGetter == null)
                    urlStream = InnerTagGet.all(sectionPage, "a", "href")
                        .stream().map((TagNode tn) -> tn.AV("href"));

                else
                    urlStream = linksGetter.apply(sectionURL, sectionPage).stream();
            }

            catch (Exception e)
            {
                LOG_WRITE(
                    log,
                    BRED + "Error loading this main-section page-URL\n" + RESET +
                    e.getMessage() + '\n'
                );

                if (SKIP_ON_SECTION_URL_EXCEPTION)
                {
                    LOG_WRITE(log, "Non-fatal Exception, continuing to next Section URL.\n\n");
                    continue;
                }

                else
                {
                    LOG_WRITE(
                        log,
                        BRED + "Fatal - Exiting.  Top-Level Section URL's must be valid URL's." +
                        RESET + "\n" + HTTPCodes.convertMessageVerbose(e, sectionURL, 0) + '\n'
                    );

                    throw new SectionURLException
                        ("Invalid Main Section URL: " + sectionURL.toString(), e);
                }
            }

            Vector<String> sectionArticleURLs = urlStream

                // If any TagNode's did not have an HREF-Attribute, remove those null-values
                .filter ((String href) -> (href != null))

                // Perform a standard String.trim() operation.
                .map ((String href) -> href.trim())

                // Any HREF's that are "just white-space" are now removed.
                .filter ((String href) -> href.length() > 0)


                // This removes any HREF Attribute values that begin with
                // "mailto:", "tel:", "javascript:", "magnet:", etc...

                .filter ((String href) -> StrCmpr.startsWithNAND_CI(href, Links.NON_URL_HREFS()))

                // Now, resolve any "Partial URL References"
                .map ((String href) -> Links.resolve_KE(href, sectionURL))


                // If there were any exceptions while performing the Partial-URL Resolve-Operation,
                // then print an error message.

                .peek ((Ret2<URL, MalformedURLException> r2) ->
                {
                    if (r2.b != null) LOG_WRITE(
                        log,
                        "An HREF resolved to a malformed URL, and provided exception message:\n" +
                        r2.b.getMessage() + '\n'
                    );
                })

                // Convert the Ret2<URL, Exception> to just the URL, without any Exceptions
                .map ((Ret2<URL, MalformedURLException> r2) -> r2.a)

                // If there was an exception, the Ret2.a field would be null (remove nulls)
                .filter ((URL url) -> url != null)


                // NOTE: When this evaluates to TRUE - the element should be kept.
                // Java Stream's 'filter' method states that when the predicate evaluates to TRUE,
                // the stream element is KEPT / RETAINED.
                //
                // Class URLFilter mimics the filter behavior of Stream.filter(...)

                .filter ((URL url) -> (articleURLFilter == null) || articleURLFilter.test(url))


                // Convert these to "Standard Strings"
                // Case-insensitive parts are set to lower-case
                // Case-sensitive parts are left alone.

                .map ((URL url) -> URLs.urlToString(url))


                // Filter any duplicates -> this is why the case-insensitive parts were
                // normalized (lower-cased) above.

                .distinct()


                // Double-check that each String still parses as a valid URL.  There really
                // should not be any exceptions here; this is just an "extra-careful" step.

                .filter ((String url) ->
                { try { new URL(url); return true; } catch (Exception e) { return false; } })

                // Convert the Stream to a Vector
                .collect(Collectors.toCollection(Vector::new));

            ret.add(sectionArticleURLs);

            LOG_WRITE(
                log,
                "Found [" + BYELLOW + sectionArticleURLs.size() + RESET + "] " +
                "Article Links.\n\n"
            );
        }


        // Provide a simple count in the log output of how many URL's have been uncovered.
        // NOTE: This does not heed whether different sections contain non-distinct URL's.
        // (An identical URL found in two different sections will be counted twice!)

        int totalURLs = 0;

        // The wild-card <?> prevents "-Xlint:all" from generating warnings...
        for (Vector<?> section : ret) totalURLs += section.size();

        LOG_WRITE(
            log,
            "Complete Possible-Article URL list has: " +
            BYELLOW + StrPrint.zeroPad10e4(totalURLs) + RESET + ' ' +
            "url(s).\n\n"
        );

        return ret;
    }

    // This wrapper is necessary because java.lang.Appendable.append(...) may throw an IOException
    private static void LOG_WRITE(Appendable log, String s)
    {
        try
        { log.append(s); }

        catch (Exception e)
        {
            System.out.println(
                "While trying to write to the log, an exception occurred.\n" +
                "The java.lang.Appendable you have provided threw an IOException:\n" +
                StrIndent.indent(e.getMessage(), 4) +
                "\nUnfortunately, with a faulty Appendable-Log, the scraper cannot continue."
            );

            e.printStackTrace();

            System.out.println("Fatal Error, JVM Exiting...");

            System.exit(1);
        }
    }
}
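
// -------------------------------------------------------------------------------------------------
// Illustrative usage sketch.  This small demo class is NOT part of the original library; it is an
// assumption-laden example added purely for documentation.  It shows one way a caller might invoke
// the convenience method ScrapeURLs.get(NewsSite, Appendable), assuming a fully-configured
// NewsSite instance ('site') is already available; building that instance is outside this sketch.
// -------------------------------------------------------------------------------------------------
class ScrapeURLsUsageSketch
{
    static void printArticleCounts(NewsSite site) throws IOException
    {
        // StringBuilder implements java.lang.Appendable, so it can serve as the log
        StringBuilder log = new StringBuilder();

        // Each inner Vector holds the article URL's scraped from one newspaper section
        Vector<Vector<String>> articleURLs = ScrapeURLs.get(site, log);

        for (int i = 0; i < articleURLs.size(); i++)
            System.out.println("Section " + i + ": " + articleURLs.get(i).size() + " article URL's");

        // Dump the scraper's log output last
        System.out.print(log);
    }
}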