// ToHTML.java.html

package Torello.HTML.Tools.NewsSite;

import Torello.HTML.*;
import Torello.Java.*;

import Torello.HTML.Tools.Images.ImageScraper;
import Torello.HTML.Tools.Images.Request;

import static Torello.Java.C.*;

import java.util.*;
import java.io.*;
import java.util.regex.*;

import java.net.URL;
import java.util.concurrent.TimeUnit;

/**
 * Converts Serialized Object Files of HTML-Vectors into <CODE>'&#46;html'</CODE> Files, and can
 * also be used to do any user-defined, customized post-processing (using a function-pointer) on
 * news-articles (after downloading them).
 * 
 * <EMBED CLASS='external-html' DATA-FILE-ID=TO_HTML>
 */
@Torello.JavaDoc.StaticFunctional
public class ToHTML 
{
    // Static-only utility class; never instantiated.
    private ToHTML() { }

    // Finds two runs of three-or-more digits, separated by exactly one arbitrary character, in a
    // file path.  Group 1 is used as the section-number, and group 2 as the article-number.
    private static final Pattern P1 = Pattern.compile(".*?(\\d{3,}).(\\d{3,}).*");

    /**
     * <EMBED CLASS='external-html' DATA-FILE-ID=TH_DESCRIPTION>
     * 
     * @param inputDir This parameter should contain the name of the directory that was used with
     * method {@code download(...)} from {@code ScrapeArticle's}.  This directory must exist and it
     * must contain the files that were saved.
     *
     * @param outputDir This parameter should contain the name of the directory where the expanded
     * and de-serialized {@code '.html'} files will be stored, along with their downloaded images.
     * A trailing file-separator is appended if it is missing.
     *
     * @param cleanIt           <EMBED CLASS='external-html' DATA-FILE-ID=TH_CLEAN_IT>
     * @param modifyOrRetrieve  <EMBED CLASS='external-html' DATA-FILE-ID=TH_MOD_OR_RETRIEVE>
     * 
     * @param log Output text is sent to this log.  This parameter may be null, and if it is, it
     * shall be ignored.  If this program is running on UNIX, color-codes will be included in the
     * log data.
     * 
     * <EMBED CLASS='external-html' DATA-FILE-ID=APPENDABLE>
     *
     * @throws IOException If there are any I/O Exceptions when writing image files to the
     * file-system, then this exception will throw.  Any other exception that is thrown while an
     * article is being converted is wrapped into an {@code IOException}, and is available through
     * {@code getCause()}.
     */
    @SuppressWarnings("unchecked")
    public static void convert(
        final String        inputDir,
              String        outputDir,
        final boolean       cleanIt,
        final HTMLModifier  modifyOrRetrieve,
        final Appendable    log
    )
        throws IOException
    {
        if (log != null) log.append(
            "\n" + BRED +
            "*****************************************************************************************\n" +
            "*****************************************************************************************\n" +
            RESET + " Converting Vector<HTMLNode> to '.html' files, and downloading Pictures." + BRED + "\n" +
            "*****************************************************************************************\n" +
            "*****************************************************************************************\n" +
            RESET + '\n'
        );

        if (! outputDir.endsWith(File.separator)) outputDir = outputDir + File.separator;

        // Uses the FileNode class to build an iterator of all '.dat' files that are found in the
        // 'inputDir' directory-parameter.

        final Iterator<FileNode> iter = FileNode
            .createRoot(inputDir)
            .loadTree()
            .getDirContentsFiles
                (RTC.ITERATOR(), (FileNode fn) -> fn.name.endsWith(".dat"));

        // NOTE: The "ImageScraper" spawns a (very) small "monitor thread" that ensures that
        // downloading does not "hang" the system by aborting image-downloads that take longer
        // than 10 seconds.  It is necessary to shut-down these threads on system exit, because
        // if they are not shutdown, when a java program terminates, the operating system that
        // the program is using (the terminal window) will appear to "hang" or "freeze" until
        // the extra-thread is shut-down by the JVM.  This delay can be upwards of 30 seconds.
        //
        // The 'finally' clause below guarantees that the shut-down happens exactly once, on every
        // exit path out of the loop (normal completion, IOException, or anything else).

        try
        {
            // Iterate through each of the data-files.
            while (iter.hasNext())
                try
                {
                    // Retrieve next article, using the iterator
                    final FileNode fn = iter.next();

                    // Load the instance of 'Article' into memory, using Object De-Serialization
                    final Article page =
                        FileRW.readObjectFromFileNOCNFE(fn.toString(), Article.class, true);

                    // If there are customized modifications to the page (or retrieval
                    // operations) that were requested, they are done here.

                    if (modifyOrRetrieve != null)
                    {
                        // Retrieves the section-number and article-number from file-name
                        final Matcher m = P1.matcher(fn.toString());

                        // These will be set to -1, and if the directoryName/fileName did not use
                        // the standard "factory-generated" file-save, then these will STILL BE -1
                        // when passed to the modifier lambda.

                        int sectionNum = -1;
                        int articleNum = -1;

                        if (m.find())
                            try
                            {
                                // Parse both before assigning either, so that the two values
                                // are always either both set, or both left at -1.
                                final int parsedSection = Integer.parseInt(m.group(1));
                                final int parsedArticle = Integer.parseInt(m.group(2));

                                sectionNum = parsedSection;
                                articleNum = parsedArticle;
                            }
                            catch (NumberFormatException ignored)
                            {
                                // A digit-run that is too long to fit into an 'int' cannot be a
                                // factory-generated name.  Treat it like any other non-standard
                                // file-name, and leave both numbers at -1.
                            }

                        // pass the articleBody (and its URL and section / article numbers) to
                        // the customized HTML Modifier provided by the user who called this
                        // method

                        modifyOrRetrieve.modifyOrRetrieve
                            (page.articleBody, page.url, sectionNum, articleNum);
                    }

                    // We need to build a "Sub-Directory" name for the HTML page where the
                    // downloaded images will be stored

                    final String outDirName = subDirNameFor(outputDir, fn.name);

                    // Make sure the subdirectory exists.
                    new File(outDirName).mkdirs();

                    // This process may be skipped, but it makes the output HTML much cleaner and
                    // more readable for most Internet News Web-Sites.  The <SCRIPT> blocks,
                    // <STYLE> blocks and <!-- --> comments are removed.  Also, any "class" or
                    // "id" attributes are eliminated.

                    if (cleanIt)
                    {
                        Util.Remove.scriptNodeBlocks(page.articleBody);
                        Util.Remove.styleNodeBlocks(page.articleBody);
                        Util.Remove.allCommentNodes(page.articleBody);
                        Attributes.remove(page.articleBody, "class", "id");
                    }

                    if (log != null)
                        log.append("Writing Page: " + BGREEN + fn.name + RESET + '\n');

                    // 'Localize' any images available.  'localizing' an HTML web-page means
                    // downloading the image data, and saving it to disk.

                    ImageScraper.localizeImages(page.articleBody, page.url, log, outDirName);

                    // If there were any images available, they have been downloaded and
                    // localized.  Write the (updated) HTML to an '.html' text-file.

                    FileRW.writeFile
                        (Util.pageToString(page.articleBody), outDirName + "index.html");
                }

                // I/O problems are passed to the caller unchanged.
                catch (IOException ioe) { throw ioe; }

                // Everything else is wrapped, so that the caller only has to handle IOException
                catch (Exception e)
                {
                    throw new IOException(
                        "There was a problem converting the html pages.  See exception.getCause() " +
                        "for more details.",
                        e
                    );
                }
        }

        // Whether the loop finished or failed, shutdown the Time-Out "monitor" thread.
        finally { ImageScraper.shutdownTOThreads(); }
    }

    /**
     * Builds the name of the output sub-directory for one data-file.  The file's extension
     * (everything from the last {@code '.'} onward) is dropped, and each remaining
     * {@code '.'} in the name is turned into a {@code '/'}, so that a name such as
     * {@code "001.0001.dat"} produces {@code outputDir + "001/0001/"}.
     *
     * <BR /><BR />The previous code called {@code String.replace("\\.", "/")}, which searches for
     * a literal back-slash followed by a dot (it is <B>not</B> a regular-expression), and
     * therefore never replaced anything.
     *
     * @param outputDir The output directory name.  The caller has already ensured that it ends
     * with a file-separator.
     *
     * @param fileName The simple name of the data-file.  The caller only passes names that end
     * with {@code ".dat"}, so the name always contains at least one {@code '.'}
     *
     * @return The sub-directory name, always ending with {@code '/'}
     */
    private static String subDirNameFor(final String outputDir, final String fileName)
    {
        final int       dotPos      = fileName.lastIndexOf('.');
        final String    baseName    = fileName.substring(0, dotPos);

        return outputDir + baseName.replace('.', '/') + '/';
    }
}