001package Torello.HTML;
002
003import Torello.HTML.NodeSearch.*;
004
005import static Torello.Java.C.*;
006
007import Torello.Java.FileRW;
008import Torello.Java.C;
009
010import java.util.*;
011
012import java.util.function.Predicate;
013import java.net.URL;
014import java.io.IOException;
015
016/**
017 * Class for finding ancestor & parent nodes of any selected {@link HTMLNode}.
018 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=SURROUNDING>
019 */
020@Torello.JavaDoc.StaticFunctional
021public class Surrounding
022{
023    private Surrounding() { }
024
025    /**
026     * This will return the first <B><SPAN STYLE="color: red;">ancestor</SPAN></B> node - along
     * with its closing element - as a {@code DotPair} - that matches.
028     * 
029     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC>
030     * 
031     * @param index This is the index of the node for whose "ancestors" we are searching (to use a
032     * Java-Script DOM Tree term).
033     * 
034     * @param htmlTags If this list is empty, we shall look for any ancestor node.  Since this
035     * method returns the first, if this list is left empty, and the index-node is surrounded by
036     * even a bold "{@code <B>...</B>}" then that will be the {@code DotPair} result that is
037     * returned.  If this list is left non-empty, then the only ancestor nodes whose HTML Element
038     * Tag (usually referred to as "the Element") matches a tag from this list shall be returned.
039     *
040     * <BR /><BR /><DIV CLASS=JDHint>
041     * <B>FOR INSTANCE:</B> If {@code "div", "p"}, and {@code "a"} were provided as values to this
     * parameter - the search loop would skip over all ancestors that were not HTML divider,
043     * paragraph or anchor elements before selecting a result.
044     * </DIV>
045     * 
046     * @return This shall return the first sub-list, as a {@code 'DotPair'} (start &amp; end index
047     * pair). If no matches are found, null will return.  This sublist is nearly identical to the
048     * Java-Script <B STYLE="color: red">DOM Tree</B> concept of ancestor-node, though no trees are
049     * constructed by this method.
050     * 
051     * @throws ArrayIndexOutOfBoundsException If index is not within the bounds of the passed 
052     * vectorized-html parameter {@code 'html'}
053     * 
054     * @throws HTMLTokException If any of the tags passed are null, or not found in the table of 
055     * {@code class HTMLTags} - specifically if they are not valid HTML Elements.
056     * 
057     * @see #FIRST(Vector, int, HTMLTagCounter)
058     * @see ARGCHECK#index(Vector, int)
059     */
060    public static DotPair first(Vector<? extends HTMLNode> html, int index, String... htmlTags)
061    {
062        return FIRST(
063            html, ARGCHECK.index(html, index),
064            new HTMLTagCounter(htmlTags, HTMLTagCounter.NORMAL, HTMLTagCounter.FIRST)
065        );
066    }
067
068    /**
069     * This will return the first <B><SPAN STYLE="color: red;">ancestor</SPAN></B> node - along
     * with its closing element - as a {@code DotPair} - that matches the input-parameter
     * {@code 'htmlTags'}. In this case, the term {@code 'except'} shall mean that any matches whose
072     * HTML Token is among the list in parameter {@code String... htmlTags} will be <B>skipped</B>,
073     * and a "higher-level" ancestor will be returned instead.
074     * 
075     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC>
076     * 
077     * @param index This is the index of the node for whose "ancestors" we are searching (to use a
078     * Java-Script {@code DOM Tree} term).
079     * 
080     * @param htmlTags When this list is non-empty (contains <I>at least one token</I>), the search
081     * loop will skip over ancestor nodes that are among the members of this var-args parameter
082     * list. If this method is invoked <I>and this parameter is an empty list</I>, then the search
     * loop will return the first ancestor node identified.
084     * 
085     * <BR /><BR /><DIV CLASS=JDHint>
086     * <B>FOR INSTANCE:</B> If {@code "B"} and {@code "P"} were passed as parameters to this
087     * method, then the search-loop will continue looking for higher-level ancestors - until one
088     * was found that was not an HTML {@code 'bold'} or {@code 'paragraph'} element {@code DotPair}
089     * </DIV>
090     * 
091     * @return This shall return the first sub-list, as a {@code 'DotPair'} (start &amp; end index
092     * pair). If no matches are found, null will return.  This sublist is nearly identical to the
093     * Java-Script <B STYLE="color: red">DOM Tree</B> concept of ancestor-node, though no trees are
094     * constructed by this method.
095     * 
096     * @throws ArrayIndexOutOfBoundsException If index is not within the bounds of the passed 
097     * vectorized-html parameter {@code 'html'}
098     * 
099     * @throws HTMLTokException If any of the tags passed are null, or not found in the table of 
100     * {@code class HTMLTags} - specifically if they are not valid HTML Elements.
101     * 
102     * @see #FIRST(Vector, int, HTMLTagCounter)
103     * @see ARGCHECK#index(Vector, int)
104     */
105    public static DotPair firstExcept(Vector<? extends HTMLNode> html, int index, String... htmlTags)
106    {
107        return FIRST(
108            html, ARGCHECK.index(html, index),
109            new HTMLTagCounter(htmlTags, HTMLTagCounter.EXCEPT, HTMLTagCounter.FIRST)
110        );
111    }
112
113    /**
114     * This will find all <B><SPAN STYLE="color: red;">ancestors</SPAN></B> of a given index.  If
115     * parameter {@code String... htmlTags} is null, all HTML elements will be considered.  If this
116     * parameter contains any elements, then only those elements shall be considered as match in
117     * the ancestor hierarchy tree.
118     * 
119     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC>
120     * 
121     * @param index This is the index of the node for whose "ancestors" we are searching (to use a
122     * Java-Script {@code DOM Tree} term).
123     * 
     * @param htmlTags If this list is empty, we shall look for <B><I>all ancestor nodes.</I></B>
     * This method returns every ancestor node-pair found; if this list is left non-empty,
     * then the only ancestor nodes whose HTML Element Tag (usually referred to as "the token") are
     * members of this varargs {@code String} parameter list shall be considered eligible as a
     * return result for this method.
129     *
130     * <BR /><BR /><DIV CLASS=JDHint>
131     * <B>FOR INSTANCE:</B> If {@code "DIV", "P"}, and {@code "A"} were listed - the search loop
132     * would skip over all ancestors that were not HTML divider, paragraph or anchor elements
133     * before selecting a result.
134     * </DIV>
135     * 
     * @return This shall return <B><I>every</I></B> sub-list, as a {@code 'DotPair'}
137     * (start &amp; end index pair).  If no matches are found, an empty {@code Vector} of
138     * zero-elements shall return.  These sublists are nearly identical to the Java-Script
139     * <B STYLE="color: red">DOM Tree</B> concept of ancestor-nodes, though no trees are
140     * constructed by this method.
141     * 
142     * @throws ArrayIndexOutOfBoundsException If index is not within the bounds of the passed 
143     * vectorized-html parameter {@code 'html'}
144     * 
145     * @throws HTMLTokException If any of the tags passed are null, or not found in the table of
146     * {@code class HTMLTags} - specifically if they are not valid HTML Elements.
147     * 
148     * @see #ALL(Vector, int, HTMLTagCounter)
149     * @see ARGCHECK#index(Vector, int)
150     */
151    public static Vector<DotPair> all(Vector<? extends HTMLNode> html, int index, String... htmlTags)
152    { 
153        return ALL(
154            html, ARGCHECK.index(html, index),
155            new HTMLTagCounter(htmlTags, HTMLTagCounter.NORMAL, HTMLTagCounter.ALL)
156        );
157    }
158
159    /**
160     * This will find all <B><SPAN STYLE="color: red;">ancestors</SPAN></B> of a given index.  If
161     * parameter {@code String... htmlTags} is null, all HTML elements will be considered.  If this
     * parameter contains any elements, then those elements <B><I>shall not be considered</I></B>
163     * as a match in the ancestor hierarchy tree.
164     * 
165     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC>
166     * 
167     * @param index This is the index of the node for whose "ancestors" we are searching (to use a
168     * Java-Script {@code DOM Tree} term).
169     * 
170     * @param htmlTags When this list is non-empty (contains <I>at least one token</I>), the search
171     * loop will skip over ancestor nodes that are among the members of this var-args parameter
172     * list. If this method is invoked <I>and this parameter is an empty list</I>, then the search
173     * loop will return all ancestor nodes of the index node.
174     *
175     * <BR /><BR /><DIV CLASS=JDHint>
176     * <B>FOR INSTANCE:</B> If {@code "B"} and {@code "P"} were passed as parameters to this
     * method, then the search-loop which is saving all ancestor matches to its result-set, would
178     * skip over any HTML {@code 'bold'} or {@code 'paragraph'} {@code DotPair's}.
179     * </DIV>
180     * 
     * @return This shall return <B><I>every</I></B> sub-list, as a {@code 'DotPair'}
182     * (start &amp; end index pair).  If no matches are found, an empty {@code Vector} of
183     * zero-elements shall return.  These sublists are nearly identical to the Java-Script
184     * <B STYLE="color: red">DOM Tree</B> concept of ancestor-nodes, though no trees are
185     * constructed by this method.
186     * 
187     * @throws ArrayIndexOutOfBoundsException If index is not within the bounds of the passed 
188     * vectorized-html parameter {@code 'html'}
189     * 
190     * @throws HTMLTokException If any of the tags passed are null, or not found in the table of 
191     * {@code class HTMLTags} - specifically if they are not valid HTML Elements.
192     * 
193     * @see #ALL(Vector, int, HTMLTagCounter)
194     * @see ARGCHECK#index(Vector, int)
195     */
196    public static Vector<DotPair> allExcept
197        (Vector<? extends HTMLNode> html, int index, String... htmlTags)
198    {
199        return ALL(
200            html, ARGCHECK.index(html, index),
201            new HTMLTagCounter(htmlTags, HTMLTagCounter.EXCEPT, HTMLTagCounter.ALL)
202        );
203    }
204
205
206    // ********************************************************************************************
207    // ********************************************************************************************
208    // FIND INTERNAL METHODS
209    // ********************************************************************************************
210    // ********************************************************************************************
211
212
213    /**
214     * Finds the first <B><SPAN STYLE="color: red;">ancestor</SPAN></B> ("surrounding") node pair.
215     * 
216     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC>
217     * @param index This is any index within the bounds of the {@code 'html'} parameter.
218     * @param tagCounter Any internally used counter, to optimize the search routine.
219     * 
220     * @return The matching <B STYLE="color: red;">ancestor</B> node's start-and-end index as a 
221     * {@code 'DotPair'}.
222     * 
223     * @see TagNode
224     * @see HTMLNode
225     * @see DotPair
226     * @see DotPair#isInside(int)
227     * @see Util.Inclusive#dotPairOPT(Vector, int, int)
228     */
229    protected static DotPair FIRST
230        (Vector<? extends HTMLNode> html, int index, HTMLTagCounter tagCounter)
231    {
232        int     size = html.size();
233        TagNode tn;
234        DotPair ret;
235
236        for (   int i=(index-1);
237                (i >= 0) && (! tagCounter.allBanned());
238                i--
239        )
240
241            if (    ((tn = html.elementAt(i).openTag()) != null)
242                &&  tagCounter.check(tn)
243                &&  ((ret = Util.Inclusive.dotPairOPT(html, i, size)) != null)
244                &&  ret.isInside(index)
245                    // isInside(...) Should never fail, but 
246            )       // This guarantees to prevent erroneous answers
247
248                // If there is a match, return that match, and exit immediately.
249                return ret;
250
251        return null;
252    }
253
254    /**
     * Finds all <B><SPAN STYLE="color: red;">ancestor</SPAN></B> ("surrounding") node pairs.
256     * 
257     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC>
258     * @param index This is any index within the bounds of the {@code 'html'} parameter.
259     * @param tagCounter Any internally used counter, to optimize the search routine.
260     * 
261     * @return All matching <B STYLE="color: red;">ancestor</B> nodes' start-and-end index pairs
262     * inside a {@code Vector<DotPair>}
263     * 
264     * @see TagNode
265     * @see HTMLNode
266     * @see DotPair
267     * @see DotPair#isInside(int)
268     * @see Util.Inclusive#dotPairOPT(Vector, int, int)
269     */
270    protected static Vector<DotPair> ALL
271        (Vector<? extends HTMLNode> html, int index, HTMLTagCounter tagCounter)
272    {
273        HTMLNode n;     TagNode tn;     DotPair dp;     int size = html.size();
274        Vector<DotPair> ret = new Vector<>();
275
276        for (int i=(index-1); (i >= 0) && (! tagCounter.allBanned()); i--)
277
278            if (    (n = html.elementAt(i)).isTagNode()
279                &&  tagCounter.check(tn = (TagNode) n)
280            )
281            {
282                if (    ((dp = Util.Inclusive.dotPairOPT(html, i, size)) != null)
283                    &&  dp.isInside(index)
284                )           // isInside(...) Should never fail, but 
285                            // This guarantees to prevent erroneous answers
286                    ret.addElement(dp);
287
288                else
289                    // If finding a token match fails, just ignore that token from now on...
290                    tagCounter.reportFailed(tn.tok);
291
292            }
293
294        return ret;
295    }
296
297
298    // ********************************************************************************************
299    // ********************************************************************************************
300    // Tester, leave it here!  It's not doing you no harm.
301    // ********************************************************************************************
302    // ********************************************************************************************
303
304
305    static void test(String urlStr, String fileName) throws IOException
306    {
307        // String url = "http://developer.torello.directory/JavaHTML/Version%201/1.4/javadoc/" +
308        //      "Torello/HTML/NodeSearch/CommentNodeCount.html";
309    
310        StringBuilder       sb      = new StringBuilder();
311        URL                 url     = new URL(urlStr);
312        Vector<HTMLNode>    page    = HTMLPage.getPageTokens(url, false);
313
314        int     pos = TextNodeFind.first(page, TextComparitor.CN_CI, "a count of how many");
315        DotPair dp  = Surrounding.firstExcept(page, pos, "li", "body", "div");
316
317        sb.append("Text Node Found: [" + page.elementAt(pos) + "]\n");
318        sb.append("Index Found: " + pos + ", DotPair Found: " + dp.toString() + "\n");
319        sb.append(Debug.print(page, dp, Debug::J) + "\n");
320
321        Vector<DotPair> allDP = Surrounding.allExcept(page, pos, "body", "html", "div");
322
323        for (DotPair l : allDP) sb.append(
324            BCYAN + 
325            "************************************************************\n" +
326            "************************************************************\n" + RESET +
327            "Index Found: " + pos + ", DotPair Found: " + l.toString() + "\n" +
328            "Starting Node: " + BRED + page.elementAt(l.start).str + RESET + "\n" +
329            "Ending Node:" + BRED + page.elementAt(l.end).str + RESET + "\n"
330        );
331
332        String s = sb.toString();
333        System.out.println(s);
334
335        if (fileName != null)
336            FileRW.writeFile(C.toHTML(s.replace("<", "&lt;").replace(">", "&gt;")), fileName);
337    }
338}