001/* 002 * NewsScraper.java 003 * $Header: f:/cvsroot2/open/projects/WebARTS/ca/bc/webarts/widgets/NewsScraper.java,v 1.5 2002/07/26 17:10:30 tgutwin Exp $ 004 * Copyright (c) 2002 Tom Gutwin P.Eng. 005 * 006 * This program is free software; you can redistribute it and/or 007 * modify it under the terms of the GNU General Public License 008 * as published by the Free Software Foundation; either version 2 009 * of the License, or any later version. 010 * 011 * This program is distributed in the hope that it will be useful, 012 * but WITHOUT ANY WARRANTY; without even the implied warranty of 013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 014 * GNU General Public License for more details. 015 * 016 * You should have received a copy of the GNU General Public License 017 * along with this program; if not, write to the Free Software 018 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 019 */ 020package ca.bc.webarts.widgets; 021 022import java.io.IOException; 023import java.io.InputStream; 024import java.io.InputStreamReader; 025 026import java.net.HttpURLConnection; 027import java.net.MalformedURLException; 028import java.net.URL; 029import java.net.UnknownServiceException; 030 031import java.util.Date; 032 033 034/** 035 * This class is a Java Bean to Scrape html from existing websites.It is useful 036 * in JSPs and servlets when you want to include news feeds or stock updates 037 * from other sites on your page.<BR /> 038 * <B>Example Usage</B> (JSP file): 039 * <pre> 040 * 041 * <%@ page language="java" %> 042 * <jsp:useBean id="newsBean" 043 * class="ca.bc.webarts.widgets.NewsScraper" 044 * scope="page" /> 045 * <HTML><BODY> 046 * 047 * <tablecellpadding="2" border="1"> 048 * <tr> 049 * <td valign="top" width="60%"> 050 * <center><U>Java News</U> </center> <BR /> 051 * <!-- Java News Feed --> 052 * <jsp:setProperty 053 * name="newsBean" 054 * property="connectionUrlStr" 055 * value="http://www-105.ibm.com/developerworks/news.nsf/dw/java-current-bydate?OpenDocument&Count=500&loc=j" 056 * /> 057 * <jsp:setProperty 058 * name="newsBean" 059 * property="startString" 060 * value="<!-- START CONTENT AREA -->" /> 061 * <jsp:setProperty 062 * name="newsBean" 063 * property="endString" 064 * value="<!-- END CONTENT AREA -->" /> 065 * <jsp:setProperty 066 * name="newsBean" 067 * property="searchScrapeTerm" 068 * value="developerworks/cgi-bin/click.cgi?url=" /> 069 * <span id="small"> 070 * <%= newsBean.doFullScrape() %> 071 * </span> 072 * </td> 073 * </tr> 074 * </table> 075 * </body> </html> 076 * </pre> 077 * 078 * @author tgutwin 079 * @created June 29, 2002 080 */ 081public class NewsScraper 082{ 083 /** The URL Connection that will be the source for the scraped items */ 084 private HttpURLConnection newsSourceConnection = null; 085 /** The reader for that will read from the URL Connection. */ 086 private InputStreamReader newsReader = null; 087 /** The In Stream from the URL Connection. */ 088 private InputStream newsStream = null; 089 /** The String representation of the URL to connect to */ 090 private String connectionUrlStr = ""; 091 /** 092 * An string used to mark the start of the usefull data from the URL 093 * Connection data. Any data outside the startString/endString tags will not 094 * even be brought down to analyze. 095 */ 096 private String startString = "<html>"; 097 /** 098 * An string used to mark the end of the usefull data from the URL Connection 099 * data. Any data outside the startString/endString tags will not even be 100 * brought down to analyze. 101 */ 102 private String endString = "</html>"; 103 /** 104 * An string used to mark the start of the scraped data from the URL 105 * Connection data. All data BETWEEN the end of this String (<PRE>searchScrapeTerm</PRE> 106 * ) and the start of the the <PRE>endSearchScrapeTerm</PRE> will be used as 107 * one item of scraped data. 108 */ 109 private String searchScrapeTerm = ""; 110 /** 111 * An string used to mark the start of the scraped data from the URL 112 * Connection data. All data BETWEEN the end of the <PRE>searchScrapeTerm</PRE> 113 * and the start of the the <PRE>endSearchScrapeTerm</PRE> will be used as 114 * one item of scraped data. 115 */ 116 private String endSearchScrapeTerm = "</a"; 117 /** A string that will hold the scraped data */ 118 private String fullScrape_ = ""; 119 /** 120 * The maximum number of scraped items to store in the <PRE>fullScrape_</PRE> 121 * variable. 122 */ 123 private int maxNewsItems_ = 15; 124 /** 125 * The URL String that will get prepended to the front of the scraped News 126 * link. 127 */ 128 private String linkPrependUrlStr_ = ""; 129 /** 130 * The Seconds to wait before revisiting the URL Connection to update the 131 * data that was scraped. (the cached time in Seconds). 132 */ 133 private int cacheTime_ = 30 * 60; 134 // 30 minutes 135 private boolean useCachedResults_ = true; 136 137 /* 138 * the following fields are not set/get by the users. 139 */ 140 /** Description of the Field */ 141 private long lastVisitTime_ = 0; 142 private String content = ""; 143 144 145 146 /** Constructor for the NewsScraper object */ 147 public NewsScraper() { } 148 149 150 /** 151 * Constructor for the NewsScraper object 152 * 153 * @param con Description of the Parameter 154 */ 155 public NewsScraper(HttpURLConnection con) 156 { 157 setNewsSourceConnection(con); 158 } 159 160 161 /** 162 * Constructor for the NewsScraper object 163 * 164 * @param con Description of the Parameter 165 * @param strt Description of the Parameter 166 * @param end Description of the Parameter 167 */ 168 public NewsScraper(HttpURLConnection con, String strt, String end) 169 { 170 setNewsSourceConnection(con); 171 setStartString(strt); 172 setEndString(end); 173 } 174 175 176 /** 177 * Constructor for the NewsScraper object 178 * 179 * @param connectionUrlStr Description of the Parameter 180 * @param strt Description of the Parameter 181 * @param end Description of the Parameter 182 * @param searchTerm Description of the Parameter 183 */ 184 public NewsScraper(String connectionUrlStr, String strt, String end, String searchTerm) 185 { 186 //try 187 //{ 188 setSearchScrapeTerm(searchTerm); 189 setConnectionUrlStr(connectionUrlStr); 190 setStartString(strt); 191 setEndString(end); 192 /* 193 * newsSourceConnection = (HttpURLConnection) 194 * (new URL(connectionUrlStr)).openConnection(); 195 * setNewsSourceConnection(newsSourceConnection); 196 * } 197 * catch (MalformedURLException badUrlEx) 198 * { 199 * newsSourceConnection = null; 200 * } 201 * catch (IOException IOEx) 202 * { 203 * newsSourceConnection = null; 204 * } 205 */ 206 } 207 208 209 /** 210 * Sets the maxNewsItems_ attribute of the NewsScraper object 211 * 212 * @param maxNewsItems_ The new maxNewsItems_ value 213 */ 214 public void setMaxNewsItems(int maxNewsItems_) 215 { 216 this.maxNewsItems_ = maxNewsItems_; 217 } 218 219 220 221 /** 222 * Gets the maxNewsItems_ attribute of the NewsScraper object 223 * 224 * @return The maxNewsItems_ value 225 */ 226 public int getMaxNewsItems() 227 { 228 return maxNewsItems_; 229 } 230 231 232 233 /** 234 * Sets the newsSourceConnection attribute of the NewsScraper object 235 * 236 * @param newsSourceConnection The new newsSourceConnection value 237 */ 238 public void setNewsSourceConnection(HttpURLConnection newsSourceConnection) 239 { 240 this.newsSourceConnection = newsSourceConnection; 241 } 242 243 244 /** 245 * Gets the newsSourceConnection attribute of the NewsScraper object 246 * 247 * @return The newsSourceConnection value 248 */ 249 public HttpURLConnection getNewsSourceConnection() 250 { 251 return newsSourceConnection; 252 } 253 254 255 /** 256 * Sets the fullScrape_ attribute of the NewsScraper object 257 * 258 * @param fullScrape_ The new fullScrape_ value 259 */ 260 public void setFullScrape(String fullScrape_) 261 { 262 this.fullScrape_ = fullScrape_; 263 } 264 265 266 /** 267 * Gets the fullScrape_ attribute of the NewsScraper object 268 * 269 * @return The fullScrape_ value 270 */ 271 public String getFullScrape() 272 { 273 return fullScrape_; 274 } 275 276 277 /** 278 * Sets the searchScrapeTerm attribute of the NewsScraper object 279 * 280 * @param searchScrapeTerm The new searchScrapeTerm value 281 */ 282 public void setSearchScrapeTerm(String searchScrapeTerm) 283 { 284 this.searchScrapeTerm = searchScrapeTerm; 285 } 286 287 288 /** 289 * Gets the searchScrapeTerm attribute of the NewsScraper object 290 * 291 * @return The searchScrapeTerm value 292 */ 293 public String getSearchScrapeTerm() 294 { 295 return searchScrapeTerm; 296 } 297 298 299 /** 300 * Sets the endSearchScrapeTerm attribute of the NewsScraper object 301 * 302 * @param endSearchScrapeTerm The new endSearchScrapeTerm value 303 */ 304 public void setEndSearchScrapeTerm(String endSearchScrapeTerm) 305 { 306 this.endSearchScrapeTerm = endSearchScrapeTerm; 307 } 308 309 310 /** 311 * Gets the endSearchScrapeTerm attribute of the NewsScraper object 312 * 313 * @return The searchScrapeTerm value 314 */ 315 public String getEndSearchScrapeTerm() 316 { 317 return endSearchScrapeTerm; 318 } 319 320 321 public void setLinkPrependUrlStr(String linkPrependUrlStr) 322 { 323 this.linkPrependUrlStr_ = linkPrependUrlStr; 324 } 325 326 327 328 public String getLinkPrependUrlStr() 329 { 330 return linkPrependUrlStr_; 331 } 332 333 334 335 /** 336 * Sets the connectionUrlStr attribute of the NewsScraper object 337 * 338 * @param connectionUrlStr The new connectionUrlStr value 339 */ 340 public void setConnectionUrlStr(String connectionUrlStr) 341 { 342 if (!getConnectionUrlStr().trim().equals(connectionUrlStr.trim())) 343 { 344 useCachedResults_ = false; 345 setFullScrape(""); 346 setNewsSourceConnection(null); 347 setLinkPrependUrlStr(""); 348 } 349 this.connectionUrlStr = connectionUrlStr; 350 } 351 352 353 /** 354 * Gets the connectionUrlStr attribute of the NewsScraper object 355 * 356 * @return The connectionUrlStr value 357 */ 358 public String getConnectionUrlStr() 359 { 360 return connectionUrlStr; 361 } 362 363 364 /** 365 * Sets the startString attribute of the NewsScraper object 366 * 367 * @param strtString The new startString value 368 */ 369 public void setStartString(String strtString) 370 { 371 startString = strtString; 372 } 373 374 375 /** 376 * Gets the startString attribute of the NewsScraper object 377 * 378 * @return The startString value 379 */ 380 public String getStartString() 381 { 382 return startString; 383 } 384 385 386 /** 387 * Sets the endString attribute of the NewsScraper object 388 * 389 * @param endString The new endString value 390 */ 391 public void setEndString(String endString) 392 { 393 this.endString = endString; 394 } 395 396 397 /** 398 * Gets the endString attribute of the NewsScraper object 399 * 400 * @return The endString value 401 */ 402 public String getEndString() 403 { 404 return endString; 405 } 406 407 408 /** 409 * Sets the cacheTime_ attribute of the NewsScraper object 410 * 411 * @param cacheTime_ The new cacheTime_ value 412 */ 413 public void setCacheTime(int cacheTime_) 414 { 415 this.cacheTime_ = cacheTime_; 416 } 417 418 419 420 /** 421 * Gets the cacheTime_ attribute of the NewsScraper object 422 * 423 * @return The cacheTime_ value 424 */ 425 public int getCacheTime() 426 { 427 return cacheTime_; 428 } 429 430 431 /** 432 * This method takes the passed striing and removes any internal HTML tags 433 * That might screw up the formatting of the scrape. 434 * 435 * @param scrapedEntry The scraped String to parse 436 * 437 * @return The requested data as a String. 438 **/ 439 private String cleanScrapedEntry(String scrapedEntry) 440 { 441 // remove form, input, select, table, td, tr, font, img 442 StringBuffer retVal = new StringBuffer(scrapedEntry); 443 String [] tagsToRemove = {"<form", "<input", "<select", "<table", "<td", 444 "<tr", "<font", "<img", "<b", 445 "</form", "</input", "</select", "</table", "</td", 446 "</tr", "</font", "</img", "</b" 447 }; 448 boolean done = false; 449 int spot = -1; 450 int endOfSpot = -1; 451 while(!done) 452 { 453 done = true; 454 for (int i=0; i < tagsToRemove.length; i++) 455 if ((spot = scrapedEntry.trim().toLowerCase().indexOf( 456 tagsToRemove[i])) != -1) 457 { 458 endOfSpot = scrapedEntry.trim().toLowerCase().indexOf(">",spot+1)+2; 459 if (endOfSpot != -1) 460 { 461 retVal = null; 462 retVal = new StringBuffer(scrapedEntry.substring(0,spot)); 463 retVal.append(scrapedEntry.substring(endOfSpot)); 464 done = false; 465 scrapedEntry = retVal.toString(); 466 } 467 } 468 } 469 spot = -1; 470 while((spot = scrapedEntry.trim().toLowerCase().indexOf("'")) != -1) 471 { 472 retVal = null; 473 retVal = new StringBuffer(scrapedEntry.substring(0,spot)); 474 retVal.append("\""); 475 if (spot+1 < scrapedEntry.length()) 476 retVal.append(scrapedEntry.substring(spot+1)); 477 scrapedEntry = retVal.toString(); 478 } 479 return retVal.toString(); 480 481} 482 483 /** 484 * Goes to the predefined URL and scrapes the requested data. 485 * 486 * @return The requested data as a String holding an html Ordered List 487 * "<ol>". 488 */ 489 public String doFullScrape() 490 { 491 StringBuffer retVal = new StringBuffer(""); 492 StringBuffer contentBuffer = new StringBuffer(""); 493 int totalBytesRead = 0; 494 int currentBytesRead = 0; 495 int maxRead = 5000; 496 497 //Date date = new Date(); 498 long currentTime_ = System.currentTimeMillis(); 499 System.out.println("lastVisitTime_ = " + new Date(lastVisitTime_).toString()); 500 System.out.println(" currentTime_ = " + new Date(currentTime_).toString()); 501 //System.out.println("FullScrape().length() = "+getFullScrape().length() ); 502 if (!(getFullScrape().length() > 0) || 503 currentTime_ - lastVisitTime_ > getCacheTime() * 1000) 504 { 505 useCachedResults_ = false; 506 } 507 System.out.println("Use Cache? " + useCachedResults_); 508 509 if (newsSourceConnection == null && !useCachedResults_) 510 { 511 try 512 { 513 newsSourceConnection = (HttpURLConnection) 514 (new URL(getConnectionUrlStr())).openConnection(); 515 setNewsSourceConnection(newsSourceConnection); 516 //date = new Date(); 517 lastVisitTime_ = System.currentTimeMillis(); 518 } 519 catch (MalformedURLException badUrlEx) 520 { 521 newsSourceConnection = null; 522 } 523 catch (IOException IOEx) 524 { 525 newsSourceConnection = null; 526 } 527 528 } 529 530 if (newsSourceConnection != null && !useCachedResults_) 531 { 532 try 533 { 534 newsStream = newsSourceConnection.getInputStream(); 535 if (newsStream == null) 536 { 537 retVal.append("Sorry, Java News Feed is unavailable.\n"); 538 retVal.append(getFullScrape()); 539 } 540 else 541 { 542 setFullScrape(""); 543 newsReader = new InputStreamReader(newsStream); 544 char[] chars = new char[maxRead]; 545 546 while (currentBytesRead >= 0) 547 { 548 currentBytesRead = newsReader.read(chars); 549 totalBytesRead += currentBytesRead; 550 contentBuffer.append((new String(chars)).trim()); 551 } 552 553 newsReader.close(); 554 content = contentBuffer.toString().trim(); 555 int startSpot = content.indexOf(startString); 556 int endSpot = content.indexOf(endString); 557 if (startSpot != -1 && endSpot != -1) 558 { 559 content = content.substring(startSpot, endSpot); 560 //System.out.println("Found the Tags\n" + startString +"\n"+endString); 561 } 562 int len = content.length(); 563 //System.out.println("Spots = "+startSpot +" "+endSpot); 564 //System.out.println(content); 565 //System.out.println("Content Length="+content.length()); 566 if (len > 0) 567 { 568 int srchLen = searchScrapeTerm.length(); 569 retVal.append("Sorted By Date: "); 570 retVal.append("(cached at "); 571 retVal.append(new Date(lastVisitTime_).toString()); 572 retVal.append(")\n"); 573 retVal.append("<ol>\n"); 574 int countOfItems = 0; 575 for (int i = 0; i < len && countOfItems < maxNewsItems_; i++) 576 { 577 startSpot = content.indexOf(searchScrapeTerm, i); 578 endSpot = content.toUpperCase().indexOf( 579 endSearchScrapeTerm.toUpperCase(), startSpot); 580 if (startSpot != -1 && endSpot != -1) 581 { 582 String scrapedLink = content.substring( 583 startSpot + srchLen, endSpot); 584 countOfItems++; 585 retVal.append("<li><a href=\""); 586 if (!scrapedLink.toLowerCase().startsWith("http://")) 587 { 588 if (!scrapedLink.toLowerCase().startsWith("/")) 589 { 590 retVal.append(getConnectionUrlStr()); 591 if (!getConnectionUrlStr().endsWith("/")) 592 retVal.append("/"); 593 } 594 else 595 retVal.append(linkPrependUrlStr_); 596 } 597 retVal.append(cleanScrapedEntry(scrapedLink)); 598 if (endSearchScrapeTerm.equals("</a")) 599 retVal.append("</a>"); 600 retVal.append("</li>\n"); 601 i = endSpot; 602 } 603 } 604 retVal.append("</ol>\n"); 605 606 } 607 else 608 { 609 retVal.append("Sorry, Unable to get Java News Feed results.\n"); 610 } 611 } 612 } 613 catch (IndexOutOfBoundsException iEx) 614 { 615 iEx.printStackTrace(); 616 retVal.append("Sorry, Unable to get Java News Feed.\n"); 617 retVal.append("Total Bytes Read = " + totalBytesRead); 618 retVal.append("\n"); 619 retVal.append("Current Bytes Read = " + currentBytesRead); 620 retVal.append("\n"); 621 } 622 catch (Exception ex) 623 { 624 ex.printStackTrace(); 625 retVal.append("Sorry, Unable to get Java News Feed."); 626 } 627 useCachedResults_ = true; 628 setFullScrape(retVal.toString()); 629 } 630 newsSourceConnection = null; 631 632 return getFullScrape(); 633 } 634 635 636 /** 637 * The main program for the NewsScraper class, It serves as a test entry 638 * point. 639 * 640 * @param args The command line arguments 641 */ 642 public static void main(String[] args) 643 { 644 NewsScraper news = new NewsScraper( 645 "http://www-105.ibm.com/developerworks/news.nsf/dw/java-current-bydate?OpenDocument&Count=500&loc=j", 646 "<!-- START CONTENT AREA -->", 647 "<!-- END CONTENT AREA -->", 648 "developerworks/cgi-bin/click.cgi?url="); 649 news.doFullScrape(); 650 System.out.println(news.getFullScrape()); 651 System.out.println(news.content); 652 653 // test the caching 654 //news.doFullScrape(); 655 //System.out.println(news.getFullScrape()); 656 657 658 news.setConnectionUrlStr("http://www.apple.com/hotnews"); 659 news.setStartString("<!-- INSERT TOP STORIES -->"); 660 news.setEndString("<!-- END TOP STORIES -->"); 661 news.setSearchScrapeTerm("<B><A HREF=\""); 662 news.setEndSearchScrapeTerm("<BR><BR>"); 663 news.setMaxNewsItems(10); 664 news.setLinkPrependUrlStr("http://www.apple.com"); 665 news.doFullScrape(); 666 System.out.println(news.getFullScrape()); 667 System.out.println(news.content); 668 } 669} 670