package Torello.HTML.Tools.NewsSite;

import Torello.HTML.HTMLPageMWT;
import Torello.Java.StrFilter;

import Torello.Java.Additional.Ret4;

import static Torello.Java.C.RESET;
import static Torello.Java.C.BRED;
import static Torello.Java.C.GREEN;

import java.util.Objects;
import java.util.Vector;
import java.io.IOException;

import java.net.URL;

/**
 * This class runs the primary iteration-loop for downloading news-articles using a list of
 * article-{@code URL's}.
 *
 * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPE_ARTICLES>
 */
@Torello.JavaDoc.StaticFunctional
public class ScrapeArticles
{
    // Utility class: never instantiated.
    private ScrapeArticles() { }

    // Horizontal-rule of asterisks used to frame the banner messages written to 'log'.
    private static final String STARS =
        "*********************************************" +
        "********************************************\n";

    /**
     * This is used to do the downloading of newspaper articles.
     *
     * @param articleReceiver <EMBED CLASS='external-html' DATA-FILE-ID=SA_ARTICLE_REC>
     * @param articleURLs <EMBED CLASS='external-html' DATA-FILE-ID=SA_ARTICLE_URLS>
     * @param articleGetter <EMBED CLASS='external-html' DATA-FILE-ID=SA_ARTICLE_GETTER>
     * @param skipArticlesWithoutPhotos <EMBED CLASS='external-html' DATA-FILE-ID=SA_SKIP_NO_PHOTOS>
     * @param bannerAndAdFinder <EMBED CLASS='external-html' DATA-FILE-ID=SA_BANNER_ADS>
     * @param keepOriginalPageHTML <EMBED CLASS='external-html' DATA-FILE-ID=SA_KEEP_HTML_BOOL>
     *
     * @param pause If there are many / numerous articles to download, pass an instance of
     * {@code class Pause}, and intermediate progress can be saved, and reloaded at a later time.
     *
     * @param log This parameter may not be null, or a {@code NullPointerException} shall throw.
     * As articles are downloaded, notices shall be posted to this {@code 'log'} by this method.
     * <EMBED CLASS='external-html' DATA-FILE-ID=APPENDABLE>
     *
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=SA_RETURNS>
     * @throws PauseException If there is an error when attempting to save the download state.
     * @throws ReceiveException <EMBED CLASS='external-html' DATA-FILE-ID=SA_RECEIVE_EX>
     * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=SA_IO_EX>
     */
    public static Vector<Vector<DownloadResult>> download(
        final ScrapedArticleReceiver articleReceiver,
        final Vector<Vector<String>> articleURLs,
        final ArticleGet articleGetter,
        final boolean skipArticlesWithoutPhotos,
        final StrFilter bannerAndAdFinder,
        final boolean keepOriginalPageHTML,
        final Pause pause,
        final Appendable log
    )
        throws PauseException, ReceiveException, IOException
    {
        // Fail-fast on the documented contract ("This parameter may not be null"), rather
        // than relying on the implicit NPE from the first 'log.append' call below.
        Objects.requireNonNull(log, "Parameter 'log' may not be null.");

        // Bundle all loop-state (counters, return vector, parameters) into one record that
        // is shared with 'loopBody' and the 'HandleExceptions' helpers.
        final RECORD r = new RECORD(
            articleReceiver,
            articleURLs,
            articleGetter,
            skipArticlesWithoutPhotos,
            bannerAndAdFinder,
            keepOriginalPageHTML,
            pause,
            log
        );

        log.append(
            "\n" + BRED + STARS + STARS +
            RESET + " Downloading Articles" + BRED + "\n" +
            STARS + STARS + RESET + '\n'
        );

        // If the user has passed an instance of 'pause' then it should be loaded from disk.
        if (pause != null)
        {
            final Ret4<Vector<Vector<DownloadResult>>, Integer, Integer, Integer> r4 =
                pause.loadState();

            r.ret.addAll(r4.a);
            r.setCounters(r4);
        }


        // If the user did not provide a "Pause" mechanism, **OR** the "Pause Mechanism" asserts
        // that the download process is starting from the beginning of the article-URL Vector,
        // THEN a *new vector* should be built.
        //
        // Initializes the capacity (sizes) of the two-dimensional "Return Vector."
        //
        // NOTE: The return Vector is exactly parallel to the input "articleURLs"
        //       two-dimensional input Vector.

        if ((pause == null) || r.countersAllZero())
            for (int i=0; i < articleURLs.size(); i++)
                r.ret.add(new Vector<DownloadResult>(articleURLs.elementAt(i).size()));


        try
        {
            for (; r.outerLoopTest(); r.incOuterCounter())

                for (r.initInnerCounter(); r.innerLoopTest(); r.incInnerCounter())

                    try
                        { loopBody(r); }

                    catch (ReceiveException re)
                        { HandleExceptions.receiverException(r, re); }

                    catch (IOException ioe)
                        { HandleExceptions.ioException(r, ioe); }

                    catch (Exception e)
                        { HandleExceptions.exception(r, e); }

                    finally
                    {
                        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
                        // Write the current "READ STATE" information (two integers)
                        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
                        //
                        // This makes sure that the download-progress is not lost when large
                        // numbers of articles are being processed.  Restart the download, and
                        // the loop variables will automatically be initialized to where they
                        // were before the JVM exited.  (Pretty Useful)

                        if (pause != null) pause.saveState
                            (r.ret, r.outerCounter(), r.innerCounter(), r.successCounter());
                    }


            log.append(
                BRED + STARS + RESET +
                "Traversing Site Completed.\n" +
                "Loaded a total of (" + r.successCounter() + ") articles.\n"
            );
        }

        finally
        {
            // Make sure to stop the "Max Wait Time Threads."
            //
            // FIX: This shutdown previously ran only on the fully-successful path.  If a
            // PauseException threw from 'pause.saveState', or one of the 'HandleExceptions'
            // helpers re-threw, the MWT threads were never stopped and could keep the JVM
            // alive.  Running it in a 'finally' guarantees cleanup on every exit path.

            HTMLPageMWT.shutdownMWTThreads();
        }

        // Returns the two-dimensional "Download Result" Vector
        return r.ret;
    }


    /**
     * Downloads and processes exactly one article — the one addressed by the current
     * outer/inner loop-counters inside {@code 'r'}.  Each processing stage is delegated to a
     * package helper; a {@code false} return from any stage means the article was skipped
     * (and the helper has already recorded the appropriate {@code DownloadResult}).
     */
    private static void loopBody(final RECORD r)
        throws IOException, ReceiveException
    {
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // Download one article
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        r.urlStr = r.articleURLs
            .elementAt(r.outerCounter())
            .elementAt(r.innerCounter());

        // Generate the URL-Instance, call the Garbage-Collector, if needed
        boolean success = URL_GC.run(r);
        if (! success) return;

        // Scrape the Web-Page, Verify that the page has non-Empty HTML
        success = ScrapePageAndVerify.run(r);
        if (! success) return;

        // Retrieve the <TITLE> element (as a String) from the page - if it has one.
        ScrapeTitle.run(r);

        // Extract the Article and Verify non-Empty Results
        success = ExtractArticleAndVerify.run(r);
        if (! success) return;

        // Extract the Image-Locations, Verify Non-Empty Results
        success = GetImageLocations.run(r);
        if (! success) return;

        // Skip "Banner Images"
        success = SkipBannerImages.run(r);
        if (! success) return;


        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // Output the Results
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        // The original page-HTML is retained only when the caller asked for it.
        final Article articleResult = new Article(
            r.url, r.title, (r.keepOriginalPageHTML ? r.page : null), r.article,
            r.imageURLs, r.imagePosArr
        );


        // The article was successfully downloaded and parsed.  Send it to the
        // "Receiver" and add DownloadResult to the return vector.

        r.log.append(
            GREEN + "ARTICLE LOADED." + RESET +
            " Sending to ScrapedArticleReceiver.\n"
        );

        r.articleReceiver.receive(articleResult, r.outerCounter(), r.innerCounter());

        r.ret
            .elementAt(r.outerCounter())
            .add(DownloadResult.SUCCESS);

        r.incSuccessCounter();
    }

}