001package Torello.HTML; 002 003import Torello.HTML.NodeSearch.*; 004 005import static Torello.Java.C.*; 006 007import Torello.Java.FileRW; 008import Torello.Java.C; 009 010import java.util.*; 011 012import java.util.function.Predicate; 013import java.net.URL; 014import java.io.IOException; 015 016/** 017 * Class for finding ancestor & parent nodes of any selected {@link HTMLNode}. 018 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=SURROUNDING> 019 */ 020@Torello.JavaDoc.StaticFunctional 021public class Surrounding 022{ 023 private Surrounding() { } 024 025 /** 026 * This will return the first <B><SPAN STYLE="color: red;">ancestor</SPAN></B> node - along 027 * with it's closing element - as a {@code DotPair} - that matches. 028 * 029 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 030 * 031 * @param index This is the index of the node for whose "ancestors" we are searching (to use a 032 * Java-Script DOM Tree term). 033 * 034 * @param htmlTags If this list is empty, we shall look for any ancestor node. Since this 035 * method returns the first, if this list is left empty, and the index-node is surrounded by 036 * even a bold "{@code <B>...</B>}" then that will be the {@code DotPair} result that is 037 * returned. If this list is left non-empty, then the only ancestor nodes whose HTML Element 038 * Tag (usually referred to as "the Element") matches a tag from this list shall be returned. 039 * 040 * <BR /><BR /><DIV CLASS=JDHint> 041 * <B>FOR INSTANCE:</B> If {@code "div", "p"}, and {@code "a"} were provided as values to this 042 * parameter - he search loop would skip over all ancestors that were not HTML divider, 043 * paragraph or anchor elements before selecting a result. 044 * </DIV> 045 * 046 * @return This shall return the first sub-list, as a {@code 'DotPair'} (start & end index 047 * pair). If no matches are found, null will return. This sublist is nearly identical to the 048 * Java-Script <B STYLE="color: red">DOM Tree</B> concept of ancestor-node, though no trees are 049 * constructed by this method. 050 * 051 * @throws ArrayIndexOutOfBoundsException If index is not within the bounds of the passed 052 * vectorized-html parameter {@code 'html'} 053 * 054 * @throws HTMLTokException If any of the tags passed are null, or not found in the table of 055 * {@code class HTMLTags} - specifically if they are not valid HTML Elements. 056 * 057 * @see #FIRST(Vector, int, HTMLTagCounter) 058 * @see ARGCHECK#index(Vector, int) 059 */ 060 public static DotPair first(Vector<? extends HTMLNode> html, int index, String... htmlTags) 061 { 062 return FIRST( 063 html, ARGCHECK.index(html, index), 064 new HTMLTagCounter(htmlTags, HTMLTagCounter.NORMAL, HTMLTagCounter.FIRST) 065 ); 066 } 067 068 /** 069 * This will return the first <B><SPAN STYLE="color: red;">ancestor</SPAN></B> node - along 070 * with it's closing element - as a {@code DotPair} - that matches the input-parameter 071 * {@code 'htmlTags'} In this case, the term {@code 'except'} shall mean that any matches whose 072 * HTML Token is among the list in parameter {@code String... htmlTags} will be <B>skipped</B>, 073 * and a "higher-level" ancestor will be returned instead. 074 * 075 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 076 * 077 * @param index This is the index of the node for whose "ancestors" we are searching (to use a 078 * Java-Script {@code DOM Tree} term). 079 * 080 * @param htmlTags When this list is non-empty (contains <I>at least one token</I>), the search 081 * loop will skip over ancestor nodes that are among the members of this var-args parameter 082 * list. If this method is invoked <I>and this parameter is an empty list</I>, then the search 083 * loop will return the first anestor node identified. 084 * 085 * <BR /><BR /><DIV CLASS=JDHint> 086 * <B>FOR INSTANCE:</B> If {@code "B"} and {@code "P"} were passed as parameters to this 087 * method, then the search-loop will continue looking for higher-level ancestors - until one 088 * was found that was not an HTML {@code 'bold'} or {@code 'paragraph'} element {@code DotPair} 089 * </DIV> 090 * 091 * @return This shall return the first sub-list, as a {@code 'DotPair'} (start & end index 092 * pair). If no matches are found, null will return. This sublist is nearly identical to the 093 * Java-Script <B STYLE="color: red">DOM Tree</B> concept of ancestor-node, though no trees are 094 * constructed by this method. 095 * 096 * @throws ArrayIndexOutOfBoundsException If index is not within the bounds of the passed 097 * vectorized-html parameter {@code 'html'} 098 * 099 * @throws HTMLTokException If any of the tags passed are null, or not found in the table of 100 * {@code class HTMLTags} - specifically if they are not valid HTML Elements. 101 * 102 * @see #FIRST(Vector, int, HTMLTagCounter) 103 * @see ARGCHECK#index(Vector, int) 104 */ 105 public static DotPair firstExcept(Vector<? extends HTMLNode> html, int index, String... htmlTags) 106 { 107 return FIRST( 108 html, ARGCHECK.index(html, index), 109 new HTMLTagCounter(htmlTags, HTMLTagCounter.EXCEPT, HTMLTagCounter.FIRST) 110 ); 111 } 112 113 /** 114 * This will find all <B><SPAN STYLE="color: red;">ancestors</SPAN></B> of a given index. If 115 * parameter {@code String... htmlTags} is null, all HTML elements will be considered. If this 116 * parameter contains any elements, then only those elements shall be considered as match in 117 * the ancestor hierarchy tree. 118 * 119 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 120 * 121 * @param index This is the index of the node for whose "ancestors" we are searching (to use a 122 * Java-Script {@code DOM Tree} term). 123 * 124 * @param htmlTags If this list is empty, we shall look for <I><B>all ancestor nodes.</I></B> 125 * Since this method returns the first ancestor node-pair found, f this list is left non-empty, 126 * then the only ancestor nodes whose HTML Element Tag (usually referred to as "the token") are 127 * members of this varargs {@code String} parameter list shall be considered eligible as a 128 * return result for this method. 129 * 130 * <BR /><BR /><DIV CLASS=JDHint> 131 * <B>FOR INSTANCE:</B> If {@code "DIV", "P"}, and {@code "A"} were listed - the search loop 132 * would skip over all ancestors that were not HTML divider, paragraph or anchor elements 133 * before selecting a result. 134 * </DIV> 135 * 136 * @return This shall return <I><B>every</I></B> sub-list, as a {@code 'DotPair'} 137 * (start & end index pair). If no matches are found, an empty {@code Vector} of 138 * zero-elements shall return. These sublists are nearly identical to the Java-Script 139 * <B STYLE="color: red">DOM Tree</B> concept of ancestor-nodes, though no trees are 140 * constructed by this method. 141 * 142 * @throws ArrayIndexOutOfBoundsException If index is not within the bounds of the passed 143 * vectorized-html parameter {@code 'html'} 144 * 145 * @throws HTMLTokException If any of the tags passed are null, or not found in the table of 146 * {@code class HTMLTags} - specifically if they are not valid HTML Elements. 147 * 148 * @see #ALL(Vector, int, HTMLTagCounter) 149 * @see ARGCHECK#index(Vector, int) 150 */ 151 public static Vector<DotPair> all(Vector<? extends HTMLNode> html, int index, String... htmlTags) 152 { 153 return ALL( 154 html, ARGCHECK.index(html, index), 155 new HTMLTagCounter(htmlTags, HTMLTagCounter.NORMAL, HTMLTagCounter.ALL) 156 ); 157 } 158 159 /** 160 * This will find all <B><SPAN STYLE="color: red;">ancestors</SPAN></B> of a given index. If 161 * parameter {@code String... htmlTags} is null, all HTML elements will be considered. If this 162 * parameter contains any elements, then those elements <B><I>shall not be considered</B></I> 163 * as a match in the ancestor hierarchy tree. 164 * 165 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 166 * 167 * @param index This is the index of the node for whose "ancestors" we are searching (to use a 168 * Java-Script {@code DOM Tree} term). 169 * 170 * @param htmlTags When this list is non-empty (contains <I>at least one token</I>), the search 171 * loop will skip over ancestor nodes that are among the members of this var-args parameter 172 * list. If this method is invoked <I>and this parameter is an empty list</I>, then the search 173 * loop will return all ancestor nodes of the index node. 174 * 175 * <BR /><BR /><DIV CLASS=JDHint> 176 * <B>FOR INSTANCE:</B> If {@code "B"} and {@code "P"} were passed as parameters to this 177 * method, then the search-loop which is saving all ancestor matches to it's result-set, would 178 * skip over any HTML {@code 'bold'} or {@code 'paragraph'} {@code DotPair's}. 179 * </DIV> 180 * 181 * @return This shall return <I><B>every</I></B> sub-list, as a {@code 'DotPair'} 182 * (start & end index pair). If no matches are found, an empty {@code Vector} of 183 * zero-elements shall return. These sublists are nearly identical to the Java-Script 184 * <B STYLE="color: red">DOM Tree</B> concept of ancestor-nodes, though no trees are 185 * constructed by this method. 186 * 187 * @throws ArrayIndexOutOfBoundsException If index is not within the bounds of the passed 188 * vectorized-html parameter {@code 'html'} 189 * 190 * @throws HTMLTokException If any of the tags passed are null, or not found in the table of 191 * {@code class HTMLTags} - specifically if they are not valid HTML Elements. 192 * 193 * @see #ALL(Vector, int, HTMLTagCounter) 194 * @see ARGCHECK#index(Vector, int) 195 */ 196 public static Vector<DotPair> allExcept 197 (Vector<? extends HTMLNode> html, int index, String... htmlTags) 198 { 199 return ALL( 200 html, ARGCHECK.index(html, index), 201 new HTMLTagCounter(htmlTags, HTMLTagCounter.EXCEPT, HTMLTagCounter.ALL) 202 ); 203 } 204 205 206 // ******************************************************************************************** 207 // ******************************************************************************************** 208 // FIND INTERNAL METHODS 209 // ******************************************************************************************** 210 // ******************************************************************************************** 211 212 213 /** 214 * Finds the first <B><SPAN STYLE="color: red;">ancestor</SPAN></B> ("surrounding") node pair. 215 * 216 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 217 * @param index This is any index within the bounds of the {@code 'html'} parameter. 218 * @param tagCounter Any internally used counter, to optimize the search routine. 219 * 220 * @return The matching <B STYLE="color: red;">ancestor</B> node's start-and-end index as a 221 * {@code 'DotPair'}. 222 * 223 * @see TagNode 224 * @see HTMLNode 225 * @see DotPair 226 * @see DotPair#isInside(int) 227 * @see Util.Inclusive#dotPairOPT(Vector, int, int) 228 */ 229 protected static DotPair FIRST 230 (Vector<? extends HTMLNode> html, int index, HTMLTagCounter tagCounter) 231 { 232 int size = html.size(); 233 TagNode tn; 234 DotPair ret; 235 236 for ( int i=(index-1); 237 (i >= 0) && (! tagCounter.allBanned()); 238 i-- 239 ) 240 241 if ( ((tn = html.elementAt(i).openTag()) != null) 242 && tagCounter.check(tn) 243 && ((ret = Util.Inclusive.dotPairOPT(html, i, size)) != null) 244 && ret.isInside(index) 245 // isInside(...) Should never fail, but 246 ) // This guarantees to prevent erroneous answers 247 248 // If there is a match, return that match, and exit immediately. 249 return ret; 250 251 return null; 252 } 253 254 /** 255 * Finds all <B><SPAN STYLE="color: red;">ancestor</SPAN></B> ("surrounding"} node pairs. 256 * 257 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 258 * @param index This is any index within the bounds of the {@code 'html'} parameter. 259 * @param tagCounter Any internally used counter, to optimize the search routine. 260 * 261 * @return All matching <B STYLE="color: red;">ancestor</B> nodes' start-and-end index pairs 262 * inside a {@code Vector<DotPair>} 263 * 264 * @see TagNode 265 * @see HTMLNode 266 * @see DotPair 267 * @see DotPair#isInside(int) 268 * @see Util.Inclusive#dotPairOPT(Vector, int, int) 269 */ 270 protected static Vector<DotPair> ALL 271 (Vector<? extends HTMLNode> html, int index, HTMLTagCounter tagCounter) 272 { 273 HTMLNode n; TagNode tn; DotPair dp; int size = html.size(); 274 Vector<DotPair> ret = new Vector<>(); 275 276 for (int i=(index-1); (i >= 0) && (! tagCounter.allBanned()); i--) 277 278 if ( (n = html.elementAt(i)).isTagNode() 279 && tagCounter.check(tn = (TagNode) n) 280 ) 281 { 282 if ( ((dp = Util.Inclusive.dotPairOPT(html, i, size)) != null) 283 && dp.isInside(index) 284 ) // isInside(...) Should never fail, but 285 // This guarantees to prevent erroneous answers 286 ret.addElement(dp); 287 288 else 289 // If finding a token match fails, just ignore that token from now on... 290 tagCounter.reportFailed(tn.tok); 291 292 } 293 294 return ret; 295 } 296 297 298 // ******************************************************************************************** 299 // ******************************************************************************************** 300 // Tester, leave it here! It's not doing you no harm. 301 // ******************************************************************************************** 302 // ******************************************************************************************** 303 304 305 static void test(String urlStr, String fileName) throws IOException 306 { 307 // String url = "http://developer.torello.directory/JavaHTML/Version%201/1.4/javadoc/" + 308 // "Torello/HTML/NodeSearch/CommentNodeCount.html"; 309 310 StringBuilder sb = new StringBuilder(); 311 URL url = new URL(urlStr); 312 Vector<HTMLNode> page = HTMLPage.getPageTokens(url, false); 313 314 int pos = TextNodeFind.first(page, TextComparitor.CN_CI, "a count of how many"); 315 DotPair dp = Surrounding.firstExcept(page, pos, "li", "body", "div"); 316 317 sb.append("Text Node Found: [" + page.elementAt(pos) + "]\n"); 318 sb.append("Index Found: " + pos + ", DotPair Found: " + dp.toString() + "\n"); 319 sb.append(Debug.print(page, dp, Debug::J) + "\n"); 320 321 Vector<DotPair> allDP = Surrounding.allExcept(page, pos, "body", "html", "div"); 322 323 for (DotPair l : allDP) sb.append( 324 BCYAN + 325 "************************************************************\n" + 326 "************************************************************\n" + RESET + 327 "Index Found: " + pos + ", DotPair Found: " + l.toString() + "\n" + 328 "Starting Node: " + BRED + page.elementAt(l.start).str + RESET + "\n" + 329 "Ending Node:" + BRED + page.elementAt(l.end).str + RESET + "\n" 330 ); 331 332 String s = sb.toString(); 333 System.out.println(s); 334 335 if (fileName != null) 336 FileRW.writeFile(C.toHTML(s.replace("<", "<").replace(">", ">")), fileName); 337 } 338}