001package Torello.HTML.Tools.NewsSite; 002 003import Torello.HTML.*; 004import Torello.Java.*; 005 006import Torello.HTML.Tools.Images.ImageScraper; 007import Torello.HTML.Tools.Images.Request; 008 009import static Torello.Java.C.*; 010 011import java.util.*; 012import java.io.*; 013import java.util.regex.*; 014 015import java.net.URL; 016import java.util.concurrent.TimeUnit; 017 018/** 019 * Converts Serialized Object Files of HTML-Vectors into <CODE>'.html'</CODE> Files, and can 020 * also be used to do any user-defined, customized post-processing (using a function-pointer) on 021 * news-articles (after downloading them). 022 * 023 * <EMBED CLASS='external-html' DATA-FILE-ID=TO_HTML> 024 */ 025@Torello.JavaDoc.StaticFunctional 026public class ToHTML 027{ 028 private ToHTML() { } 029 030 private static final Pattern P1 = Pattern.compile(".*?(\\d{3,}).(\\d{3,}).*"); 031 032 /** 033 * <EMBED CLASS='external-html' DATA-FILE-ID=TH_DESCRIPTION> 034 * 035 * @param inputDir This parameter should contain the name of the directory that was used with 036 * method {@code download(...)} from {@code ScrapeArticle's}. This directory must exist and it 037 * must contain the files that were saved. 038 * 039 * @param outputDir This parameter should contain the name of the directory where the expanded 040 * and de-serialized {@code '.html'} files will be stored, along with their downloaded images. 041 * 042 * @param cleanIt <EMBED CLASS='external-html' DATA-FILE-ID=TH_CLEAN_IT> 043 * @param modifyOrRetrieve <EMBED CLASS='external-html' DATA-FILE-ID=TH_MOD_OR_RETRIEVE> 044 * 045 * @param log Output text is sent to this log. This parameter may be null, and if it is, it 046 * shall be ignored. If this program is running on UNIX, color-codes will be included in the 047 * log data. 048 * 049 * <EMBED CLASS='external-html' DATA-FILE-ID=APPENDABLE> 050 * 051 * @throws IOException If there any I/O Exceptions when writing image files to the file-system, 052 * then this exception will throw. 053 */ 054 @SuppressWarnings("unchecked") 055 public static void convert( 056 final String inputDir, 057 String outputDir, 058 final boolean cleanIt, 059 final HTMLModifier modifyOrRetrieve, 060 final Appendable log 061 ) 062 throws IOException 063 { 064 if (log !=null) log.append( 065 "\n" + BRED + 066 "*****************************************************************************************\n" + 067 "*****************************************************************************************\n" + 068 RESET + " Converting Vector<HTMLNode> to '.html' files, and downloading Pictures." + BRED + "\n" + 069 "*****************************************************************************************\n" + 070 "*****************************************************************************************\n" + 071 RESET + '\n' 072 ); 073 074 if (! outputDir.endsWith(File.separator)) outputDir = outputDir + File.separator; 075 076 // Uses the FileNode class to build an iterator of all '.dat' files that are found in the 077 // 'inputDir' directory-parameter. 078 079 final Iterator<FileNode> iter = FileNode 080 .createRoot(inputDir) 081 .loadTree() 082 .getDirContentsFiles 083 (RTC.ITERATOR(), (FileNode fn) -> fn.name.endsWith(".dat")); 084 085 // Iterate through each of the data-files. 086 while (iter.hasNext()) 087 try 088 { 089 // Retrieve next article, using the iterator 090 final FileNode fn = iter.next(); 091 092 // Load the instance of 'Article' into memory, using Object De-Serialization 093 Article page = FileRW.readObjectFromFileNOCNFE(fn.toString(), Article.class, true); 094 095 // If there are customized modifications to the page (or retrieval operations) 096 // that were requested, they are done here. 097 098 if (modifyOrRetrieve != null) 099 { 100 // Retrieves the section-number and article-number from file-name 101 Matcher m = P1.matcher(fn.toString()); 102 103 // These will be set to -1, and if the directoryName/fileName did not use the 104 // standard "factory-generated" file-save, then these will STILL BE -1 when 105 // passed to the modifier lambda. 106 107 int sectionNum = -1; 108 int articleNum = -1; 109 110 if (m.find()) 111 { 112 sectionNum = Integer.parseInt(m.group(1)); 113 articleNum = Integer.parseInt(m.group(2)); 114 } 115 116 // pass the articleBody (and it's URL and filename) to the customized 117 // HTML Modifier provided by the user who called this method 118 119 modifyOrRetrieve.modifyOrRetrieve 120 (page.articleBody, page.url, sectionNum, articleNum); 121 } 122 123 // We need to build a "Sub-Directory" name for the HTML page where the download 124 // images will be stored 125 126 int dotPos = fn.name.lastIndexOf("."); 127 String outDirName = outputDir + fn.name.substring(0, dotPos).replace("\\.", "/") + '/'; 128 129 // Make sure the subdirectory exists. 130 new File(outDirName).mkdirs(); 131 132 // This process may be skipped, but it makes the output HTML much cleaner and more 133 // readable for most Internet News Web-Sites. Both <SCRIPT>, <!-- --> elements are 134 // removed. Also, any "class" or "id" fields are eliminated. This "cleaning" can 135 // be easily skipped 136 137 if (cleanIt) 138 { 139 Util.Remove.scriptNodeBlocks(page.articleBody); 140 Util.Remove.styleNodeBlocks(page.articleBody); 141 Util.Remove.allCommentNodes(page.articleBody); 142 Attributes.remove(page.articleBody, "class", "id"); 143 } 144 145 if (log != null) log.append("Writing Page: " + BGREEN + fn.name + RESET + '\n'); 146 147 // 'Localize' any images available. 'localizing' an HTML web-page means downloading 148 // the image data, and saving it to disk. 149 150 ImageScraper.localizeImages(page.articleBody, page.url, log, outDirName); 151 152 // If there were any images available, they were downloaded and localized. The 153 // Write the (updated) HTML to an '.html' text-file. 154 155 FileRW.writeFile(Util.pageToString(page.articleBody), outDirName + "index.html"); 156 } 157 158 // NOTE: The "ImageScraper" spawns a (very) small "monitor thread" that ensures that 159 // downloading does not "hang" the system by aborting image-downloads that take longer 160 // than 10 seconds. It is necessary to shut-down these threads on system exit, because 161 // if they are not shutdown, when a java program terminates, the operating system that 162 // the program is using (the terminal window) will appear to "hang" or "freeze" until 163 // the extra-thread is shut-down by the JVM. This delay can be upwards of 30 seconds. 164 165 catch (IOException ioe) 166 { ImageScraper.shutdownTOThreads(); throw ioe; } 167 168 catch (Exception e) 169 { 170 ImageScraper.shutdownTOThreads(); 171 172 throw new IOException( 173 "There was a problem converting the html pages. See exception.getCause() " + 174 "for more details.", 175 e 176 ); 177 } 178 179 // Exit the method. Again, shutdown the Time-Out "monitor" thread. 180 ImageScraper.shutdownTOThreads(); 181 } 182}