Source code

001package Torello.HTML.Tools.NewsSite;
002
003import Torello.HTML.*;
004import Torello.Java.*;
005
006import Torello.HTML.Tools.Images.ImageScraper;
007import Torello.HTML.Tools.Images.Request;
008
009import static Torello.Java.C.*;
010
011import java.util.*;
012import java.io.*;
013import java.util.regex.*;
014
015import java.net.URL;
016import java.util.concurrent.TimeUnit;
017
018/**
019 * Converts Serialized Object Files of HTML-Vectors into <CODE>'&#46;html'</CODE> Files, and can
020 * also be used to do any user-defined, customized post-processing (using a function-pointer) on
021 * news-articles (after downloading them).
022 * 
023 * <EMBED CLASS='external-html' DATA-FILE-ID=TO_HTML>
024 */
025@Torello.JavaDoc.StaticFunctional
026public class ToHTML 
027{
028    private ToHTML() { }
029
030    private static final Pattern P1 = Pattern.compile(".*?(\\d{3,}).(\\d{3,}).*");
031
032    /**
033     * <EMBED CLASS='external-html' DATA-FILE-ID=TH_DESCRIPTION>
034     * 
035     * @param inputDir This parameter should contain the name of the directory that was used with
036     * method {@code download(...)} from {@code ScrapeArticle's}.  This directory must exist and it
037     * must contain the files that were saved.
038     *
039     * @param outputDir This parameter should contain the name of the directory where the expanded
040     * and de-serialized {@code '.html'} files will be stored, along with their downloaded images.
041     *
042     * @param cleanIt           <EMBED CLASS='external-html' DATA-FILE-ID=TH_CLEAN_IT>
043     * @param modifyOrRetrieve  <EMBED CLASS='external-html' DATA-FILE-ID=TH_MOD_OR_RETRIEVE>
044     * 
045     * @param log Output text is sent to this log.  This parameter may be null, and if it is, it
046     * shall be ignored.  If this program is running on UNIX, color-codes will be included in the
047     * log data.
048     * 
049     * <EMBED CLASS='external-html' DATA-FILE-ID=APPENDABLE>
050     *
051     * @throws IOException If there any I/O Exceptions when writing image files to the file-system,
052     * then this exception will throw.
053     */
054    @SuppressWarnings("unchecked")
055    public static void convert(
056        final String        inputDir,
057              String        outputDir,
058        final boolean       cleanIt,
059        final HTMLModifier  modifyOrRetrieve,
060        final Appendable    log
061    )
062        throws IOException
063    {
064        if (log !=null) log.append(
065            "\n" + BRED +
066            "*****************************************************************************************\n" +
067            "*****************************************************************************************\n" + 
068            RESET + " Converting Vector<HTMLNode> to '.html' files, and downloading Pictures." + BRED + "\n" +
069            "*****************************************************************************************\n" +
070            "*****************************************************************************************\n" + 
071            RESET + '\n'
072        );
073
074        if (! outputDir.endsWith(File.separator)) outputDir = outputDir + File.separator;
075
076        // Uses the FileNode class to build an iterator of all '.dat' files that are found in the
077        // 'inputDir' directory-parameter.
078
079        final Iterator<FileNode> iter = FileNode
080            .createRoot(inputDir)
081            .loadTree()
082            .getDirContentsFiles
083                (RTC.ITERATOR(), (FileNode fn) -> fn.name.endsWith(".dat"));
084
085        // Iterate through each of the data-files.
086        while (iter.hasNext())
087            try
088            {
089                // Retrieve next article, using the iterator
090                final FileNode fn = iter.next();
091
092                // Load the instance of 'Article' into memory, using Object De-Serialization
093                Article page = FileRW.readObjectFromFileNOCNFE(fn.toString(), Article.class, true);
094
095                // If there are customized modifications to the page (or retrieval operations)
096                // that were requested, they are done here.
097
098                if (modifyOrRetrieve != null)
099                {
100                    // Retrieves the section-number and article-number from file-name
101                    Matcher m = P1.matcher(fn.toString());
102
103                    // These will be set to -1, and if the directoryName/fileName did not use the
104                    // standard "factory-generated" file-save, then these will STILL BE -1 when
105                    // passed to the modifier lambda.
106
107                    int sectionNum = -1;
108                    int articleNum = -1;
109
110                    if (m.find())
111                    {
112                        sectionNum = Integer.parseInt(m.group(1));
113                        articleNum = Integer.parseInt(m.group(2));
114                    }
115
116                    // pass the articleBody (and it's URL and filename) to the customized
117                    // HTML Modifier provided by the user who called this method
118
119                    modifyOrRetrieve.modifyOrRetrieve
120                        (page.articleBody, page.url, sectionNum, articleNum);
121                }
122
123                // We need to build a "Sub-Directory" name for the HTML page where the download
124                // images will be stored
125
126                int     dotPos      = fn.name.lastIndexOf(".");
127                String  outDirName  = outputDir + fn.name.substring(0, dotPos).replace("\\.", "/") + '/';
128
129                // Make sure the subdirectory exists.
130                new File(outDirName).mkdirs();
131
132                // This process may be skipped, but it makes the output HTML much cleaner and more
133                // readable for most Internet News Web-Sites.  Both <SCRIPT>, <!-- --> elements are
134                // removed.  Also, any "class" or "id" fields are eliminated.  This "cleaning" can
135                // be easily skipped
136
137                if (cleanIt)
138                {
139                    Util.Remove.scriptNodeBlocks(page.articleBody);
140                    Util.Remove.styleNodeBlocks(page.articleBody);
141                    Util.Remove.allCommentNodes(page.articleBody);
142                    Attributes.remove(page.articleBody, "class", "id");
143                }
144
145                if (log != null) log.append("Writing Page: " + BGREEN + fn.name + RESET + '\n');
146
147                // 'Localize' any images available.  'localizing' an HTML web-page means downloading
148                // the image data, and saving it to disk.
149
150                ImageScraper.localizeImages(page.articleBody, page.url, log, outDirName);
151
152                // If there were any images available, they were downloaded and localized.  The
153                // Write the (updated) HTML to an '.html' text-file.
154
155                FileRW.writeFile(Util.pageToString(page.articleBody), outDirName + "index.html");
156            }
157
158            // NOTE: The "ImageScraper" spawns a (very) small "monitor thread" that ensures that
159            // downloading does not "hang" the system by aborting image-downloads that take longer
160            // than 10 seconds.  It is necessary to shut-down these threads on system exit, because
161            // if they are not shutdown, when a java program terminates, the operating system that
162            // the program is using (the terminal window) will appear to "hang" or "freeze" until
163            // the extra-thread is shut-down by the JVM.  This delay can be upwards of 30 seconds.
164
165            catch (IOException ioe)
166                { ImageScraper.shutdownTOThreads(); throw ioe; }
167
168            catch (Exception e)
169            {
170                ImageScraper.shutdownTOThreads();
171
172                throw new IOException(
173                    "There was a problem converting the html pages.  See exception.getCause() " +
174                    "for more details.",
175                    e
176                );
177            }
178
179        // Exit the method.  Again, shutdown the Time-Out "monitor" thread.
180        ImageScraper.shutdownTOThreads();
181    }
182}