/*
 *  NewsScraper.java
 *  $Header: f:/cvsroot2/open/projects/WebARTS/ca/bc/webarts/widgets/NewsScraper.java,v 1.5 2002/07/26 17:10:30 tgutwin Exp $
 *  Copyright (c) 2002 Tom Gutwin P.Eng.
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License
 *  as published by the Free Software Foundation; either version 2
 *  of the License, or any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
package ca.bc.webarts.widgets;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;

import java.util.Date;

/**
 *  This class is a Java Bean that scrapes HTML from existing websites. It is
 *  useful in JSPs and servlets when you want to include news feeds or stock
 *  updates from other sites on your page.<BR />
 *  <B>Example Usage</B> (JSP file):
 * <pre>
 *
 *  &lt;%@ page language="java" %>
 *  &lt;jsp:useBean id="newsBean"
 *               class="ca.bc.webarts.widgets.NewsScraper"
 *               scope="page" />
 *  &lt;HTML>&lt;BODY>
 *
 *  &lt;table cellpadding="2" border="1">
 *    &lt;tr>
 *      &lt;td valign="top" width="60%">
 *        &lt;center>&lt;U>Java News&lt;/U> &lt;/center> &lt;BR />
 *        &lt;!-- Java News Feed -->
 *        &lt;jsp:setProperty
 *              name="newsBean"
 *              property="connectionUrlStr"
 *              value="http://www-105.ibm.com/developerworks/news.nsf/dw/java-current-bydate?OpenDocument&amp;Count=500&amp;loc=j"
 *        />
 *        &lt;jsp:setProperty
 *              name="newsBean"
 *              property="startString"
 *              value="&lt;!-- START CONTENT AREA -->" />
 *        &lt;jsp:setProperty
 *              name="newsBean"
 *              property="endString"
 *              value="&lt;!-- END CONTENT AREA -->" />
 *        &lt;jsp:setProperty
 *             name="newsBean"
 *             property="searchScrapeTerm"
 *             value="developerworks/cgi-bin/click.cgi?url=" />
 *        &lt;span id="small">
 *          &lt;%= newsBean.doFullScrape() %>
 *        &lt;/span>
 *      &lt;/td>
 *    &lt;/tr>
 *  &lt;/table>
 *  &lt;/body> &lt;/html>
 * </pre>
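 *
 *  The bean can also be driven from plain Java, mirroring the test code in
 *  the <PRE>main()</PRE> method. A minimal sketch (the URL and marker
 *  strings here are placeholders you would replace with your own):
 * <pre>
 *  NewsScraper news = new NewsScraper(
 *      "http://example.com/news",             // hypothetical news page
 *      "&lt;!-- START CONTENT AREA -->",         // startString
 *      "&lt;!-- END CONTENT AREA -->",           // endString
 *      "href=\"");                            // searchScrapeTerm
 *  news.setEndSearchScrapeTerm("&lt;/a");        // the default, shown for clarity
 *  news.setMaxNewsItems(10);
 *  System.out.println(news.doFullScrape());   // emits an &lt;ol> of links
 * </pre>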
 *
 * @author     tgutwin
 * @created    June 29, 2002
 */
public class NewsScraper
{
  /**  The URL Connection that will be the source for the scraped items */
  private HttpURLConnection newsSourceConnection = null;
  /**  The reader that will read from the URL Connection. */
  private InputStreamReader newsReader = null;
  /**  The input stream from the URL Connection. */
  private InputStream newsStream = null;
  /**  The String representation of the URL to connect to */
  private String connectionUrlStr = "";
  /**
   *  A string used to mark the start of the useful data from the URL
   *  Connection data. Any data outside the startString/endString markers
   *  will not even be brought down to analyze.
   */
  private String startString = "<html>";
  /**
   *  A string used to mark the end of the useful data from the URL Connection
   *  data. Any data outside the startString/endString markers will not even
   *  be brought down to analyze.
   */
  private String endString = "</html>";
  /**
   *  A string used to mark the start of the scraped data from the URL
   *  Connection data. All data BETWEEN the end of this string (<PRE>searchScrapeTerm</PRE>)
   *  and the start of the <PRE>endSearchScrapeTerm</PRE> will be used as
   *  one item of scraped data.
   */
  private String searchScrapeTerm = "";
  /**
   *  A string used to mark the end of the scraped data from the URL
   *  Connection data. All data BETWEEN the end of the <PRE>searchScrapeTerm</PRE>
   *  and the start of the <PRE>endSearchScrapeTerm</PRE> will be used as
   *  one item of scraped data.
   */
  private String endSearchScrapeTerm = "</a";
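  /*
   *  Illustrative example of the four markers (made-up page content): with
   *      startString         = "<!-- START -->"
   *      endString           = "<!-- END -->"
   *      searchScrapeTerm    = "href=\""
   *      endSearchScrapeTerm = "</a"
   *  the page fragment
   *      <!-- START --> <a href="/story1">Story One</a> <!-- END -->
   *  yields the single scraped item:  /story1">Story One
   *  (everything between the end of searchScrapeTerm and the start of
   *  endSearchScrapeTerm, tags intact until cleanScrapedEntry() runs).
   */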
  /**  A string that will hold the scraped data */
  private String fullScrape_ = "";
  /**
   *  The maximum number of scraped items to store in the <PRE>fullScrape_</PRE>
   *  variable.
   */
  private int maxNewsItems_ = 15;
  /**
   *  The URL String that will get prepended to the front of a scraped news
   *  link that starts with "/".
   */
  private String linkPrependUrlStr_ = "";
  /**
   *  The number of seconds to wait before revisiting the URL Connection to
   *  update the scraped data (the cache time, in seconds).
   */
  private int cacheTime_ = 30 * 60; // 30 minutes
  /**  Whether the next doFullScrape() call may serve the cached results. */
  private boolean useCachedResults_ = true;
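  /*
   *  Cache arithmetic, for reference: doFullScrape() refetches only when the
   *  cached scrape is empty, the URL has changed, or
   *  (System.currentTimeMillis() - lastVisitTime_) > cacheTime_ * 1000.
   *  With the default cacheTime_ of 1800 seconds, a page fetched at 12:00:00
   *  is served from fullScrape_ until roughly 12:30:00.
   */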

  /*
   *  The following fields are not exposed through set/get methods.
   */
  /**  The time (in milliseconds) of the last visit to the URL Connection. */
  private long lastVisitTime_ = 0;
  /**  The page content pulled down on the last scrape (trimmed to the
   *   start/end markers when they are found). */
  private String content = "";



  /**  Constructor for the NewsScraper object  */
  public NewsScraper() { }


  /**
   *  Constructor for the NewsScraper object
   *
   * @param  con  the HTTP connection to use as the scrape source
   */
  public NewsScraper(HttpURLConnection con)
  {
    setNewsSourceConnection(con);
  }


  /**
   *  Constructor for the NewsScraper object
   *
   * @param  con   the HTTP connection to use as the scrape source
   * @param  strt  the string marking the start of the useful data
   * @param  end   the string marking the end of the useful data
   */
  public NewsScraper(HttpURLConnection con, String strt, String end)
  {
    setNewsSourceConnection(con);
    setStartString(strt);
    setEndString(end);
  }


  /**
   *  Constructor for the NewsScraper object
   *
   * @param  connectionUrlStr  the URL (as a String) to connect to and scrape
   * @param  strt              the string marking the start of the useful data
   * @param  end               the string marking the end of the useful data
   * @param  searchTerm        the string marking the start of each scraped item
   */
  public NewsScraper(String connectionUrlStr, String strt, String end, String searchTerm)
  {
    setSearchScrapeTerm(searchTerm);
    setConnectionUrlStr(connectionUrlStr);
    setStartString(strt);
    setEndString(end);
  }


  /**
   *  Sets the maxNewsItems_ attribute of the NewsScraper object
   *
   * @param  maxNewsItems_  The new maxNewsItems_ value
   */
  public void setMaxNewsItems(int maxNewsItems_)
  {
    this.maxNewsItems_ = maxNewsItems_;
  }


  /**
   *  Gets the maxNewsItems_ attribute of the NewsScraper object
   *
   * @return    The maxNewsItems_ value
   */
  public int getMaxNewsItems()
  {
    return maxNewsItems_;
  }


  /**
   *  Sets the newsSourceConnection attribute of the NewsScraper object
   *
   * @param  newsSourceConnection  The new newsSourceConnection value
   */
  public void setNewsSourceConnection(HttpURLConnection newsSourceConnection)
  {
    this.newsSourceConnection = newsSourceConnection;
  }


  /**
   *  Gets the newsSourceConnection attribute of the NewsScraper object
   *
   * @return    The newsSourceConnection value
   */
  public HttpURLConnection getNewsSourceConnection()
  {
    return newsSourceConnection;
  }


  /**
   *  Sets the fullScrape_ attribute of the NewsScraper object
   *
   * @param  fullScrape_  The new fullScrape_ value
   */
  public void setFullScrape(String fullScrape_)
  {
    this.fullScrape_ = fullScrape_;
  }


  /**
   *  Gets the fullScrape_ attribute of the NewsScraper object
   *
   * @return    The fullScrape_ value
   */
  public String getFullScrape()
  {
    return fullScrape_;
  }


  /**
   *  Sets the searchScrapeTerm attribute of the NewsScraper object
   *
   * @param  searchScrapeTerm  The new searchScrapeTerm value
   */
  public void setSearchScrapeTerm(String searchScrapeTerm)
  {
    this.searchScrapeTerm = searchScrapeTerm;
  }


  /**
   *  Gets the searchScrapeTerm attribute of the NewsScraper object
   *
   * @return    The searchScrapeTerm value
   */
  public String getSearchScrapeTerm()
  {
    return searchScrapeTerm;
  }


  /**
   *  Sets the endSearchScrapeTerm attribute of the NewsScraper object
   *
   * @param  endSearchScrapeTerm  The new endSearchScrapeTerm value
   */
  public void setEndSearchScrapeTerm(String endSearchScrapeTerm)
  {
    this.endSearchScrapeTerm = endSearchScrapeTerm;
  }


  /**
   *  Gets the endSearchScrapeTerm attribute of the NewsScraper object
   *
   * @return    The endSearchScrapeTerm value
   */
  public String getEndSearchScrapeTerm()
  {
    return endSearchScrapeTerm;
  }


  /**
   *  Sets the linkPrependUrlStr_ attribute of the NewsScraper object
   *
   * @param  linkPrependUrlStr  The new linkPrependUrlStr_ value
   */
  public void setLinkPrependUrlStr(String linkPrependUrlStr)
  {
    this.linkPrependUrlStr_ = linkPrependUrlStr;
  }


  /**
   *  Gets the linkPrependUrlStr_ attribute of the NewsScraper object
   *
   * @return    The linkPrependUrlStr_ value
   */
  public String getLinkPrependUrlStr()
  {
    return linkPrependUrlStr_;
  }


  /**
   *  Sets the connectionUrlStr attribute of the NewsScraper object.
   *  Setting a different URL invalidates any cached scrape results.
   *
   * @param  connectionUrlStr  The new connectionUrlStr value
   */
  public void setConnectionUrlStr(String connectionUrlStr)
  {
    if (!getConnectionUrlStr().trim().equals(connectionUrlStr.trim()))
    {
      // a different URL makes the cached scrape stale; reset everything
      useCachedResults_ = false;
      setFullScrape("");
      setNewsSourceConnection(null);
      setLinkPrependUrlStr("");
    }
    this.connectionUrlStr = connectionUrlStr;
  }


  /**
   *  Gets the connectionUrlStr attribute of the NewsScraper object
   *
   * @return    The connectionUrlStr value
   */
  public String getConnectionUrlStr()
  {
    return connectionUrlStr;
  }


  /**
   *  Sets the startString attribute of the NewsScraper object
   *
   * @param  strtString  The new startString value
   */
  public void setStartString(String strtString)
  {
    startString = strtString;
  }


  /**
   *  Gets the startString attribute of the NewsScraper object
   *
   * @return    The startString value
   */
  public String getStartString()
  {
    return startString;
  }


  /**
   *  Sets the endString attribute of the NewsScraper object
   *
   * @param  endString  The new endString value
   */
  public void setEndString(String endString)
  {
    this.endString = endString;
  }


  /**
   *  Gets the endString attribute of the NewsScraper object
   *
   * @return    The endString value
   */
  public String getEndString()
  {
    return endString;
  }


  /**
   *  Sets the cacheTime_ attribute of the NewsScraper object
   *
   * @param  cacheTime_  The new cacheTime_ value, in seconds
   */
  public void setCacheTime(int cacheTime_)
  {
    this.cacheTime_ = cacheTime_;
  }


  /**
   *  Gets the cacheTime_ attribute of the NewsScraper object
   *
   * @return    The cacheTime_ value, in seconds
   */
  public int getCacheTime()
  {
    return cacheTime_;
  }


  /**
   * This method takes the passed string and removes any internal HTML tags
   * that might break the formatting of the scrape.
   *
   * @param  scrapedEntry  The scraped String to parse
   *
   * @return    The cleaned entry as a String.
   **/
  private String cleanScrapedEntry(String scrapedEntry)
  {
    // remove form, input, select, table, td, tr, font, img and b tags
    scrapedEntry = scrapedEntry.trim();
    StringBuffer retVal = new StringBuffer(scrapedEntry);
    String [] tagsToRemove = {"<form", "<input", "<select", "<table", "<td",
                            "<tr", "<font", "<img", "<b",
                            "</form", "</input", "</select", "</table", "</td",
                            "</tr", "</font", "</img", "</b"
                            };
    boolean done = false;
    int spot = -1;
    int endOfSpot = -1;
    while (!done)
    {
      done = true;
      for (int i = 0; i < tagsToRemove.length; i++)
      {
        if ((spot = scrapedEntry.toLowerCase().indexOf(tagsToRemove[i])) != -1)
        {
          // find the closing '>' of this tag; only strip if it is present
          endOfSpot = scrapedEntry.indexOf(">", spot + 1);
          if (endOfSpot != -1)
          {
            retVal = new StringBuffer(scrapedEntry.substring(0, spot));
            retVal.append(scrapedEntry.substring(endOfSpot + 1));
            done = false;
            scrapedEntry = retVal.toString();
          }
        }
      }
    }
    // replace any single quotes with double quotes
    while ((spot = scrapedEntry.indexOf("'")) != -1)
    {
      retVal = new StringBuffer(scrapedEntry.substring(0, spot));
      retVal.append("\"");
      if (spot + 1 < scrapedEntry.length())
        retVal.append(scrapedEntry.substring(spot + 1));
      scrapedEntry = retVal.toString();
    }
    return scrapedEntry;
  }
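
  /*
   *  Illustrative before/after for cleanScrapedEntry() (made-up input):
   *      in:  <font size="1">Tom's <b>news</b></font>
   *      out: Tom"s news
   *  Only the tags in tagsToRemove are stripped and single quotes become
   *  double quotes; other tags (e.g. <a>) are left alone.
   */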

  /**
   *  Goes to the predefined URL and scrapes the requested data, reusing the
   *  cached results if they are less than <PRE>cacheTime_</PRE> seconds old.
   *
   * @return    The requested data as a String holding an html Ordered List
   *            ("&lt;ol>").
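 *            A typical result (shape only; the actual links depend on the
 *            page being scraped) looks like:
 * <pre>
 *  Sorted By Date: (cached at ...)
 *  &lt;ol>
 *  &lt;li>&lt;a href="http://..."> ... &lt;/a>&lt;/li>
 *  &lt;/ol>
 * </pre>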
   */
  public String doFullScrape()
  {
    StringBuffer retVal = new StringBuffer();
    StringBuffer contentBuffer = new StringBuffer();
    int totalCharsRead = 0;
    int currentCharsRead = 0;
    int maxRead = 5000; // read the page in chunks of up to 5000 chars

    long currentTime_ = System.currentTimeMillis();
    System.out.println("lastVisitTime_ = " + new Date(lastVisitTime_).toString());
    System.out.println("     currentTime_ = " + new Date(currentTime_).toString());
    if (!(getFullScrape().length() > 0) ||
        currentTime_ - lastVisitTime_ > getCacheTime() * 1000)
    {
      useCachedResults_ = false;
    }
    System.out.println("Use Cache? " + useCachedResults_);

    if (newsSourceConnection == null && !useCachedResults_)
    {
      try
      {
        newsSourceConnection = (HttpURLConnection)
            (new URL(getConnectionUrlStr())).openConnection();
        setNewsSourceConnection(newsSourceConnection);
        lastVisitTime_ = System.currentTimeMillis();
      }
      catch (MalformedURLException badUrlEx)
      {
        newsSourceConnection = null;
      }
      catch (IOException ioEx)
      {
        newsSourceConnection = null;
      }
    }

    if (newsSourceConnection != null && !useCachedResults_)
    {
      try
      {
        newsStream = newsSourceConnection.getInputStream();
        if (newsStream == null)
        {
          retVal.append("Sorry, Java News Feed is unavailable.\n");
          retVal.append(getFullScrape());
        }
        else
        {
          setFullScrape("");
          newsReader = new InputStreamReader(newsStream);
          char[] chars = new char[maxRead];

          // append only the chars actually read on each pass (read() may
          // fill less than the full buffer and returns -1 at end of stream)
          while ((currentCharsRead = newsReader.read(chars)) >= 0)
          {
            totalCharsRead += currentCharsRead;
            contentBuffer.append(chars, 0, currentCharsRead);
          }

          newsReader.close();
          content = contentBuffer.toString().trim();
          int startSpot = content.indexOf(startString);
          int endSpot = content.indexOf(endString);
          if (startSpot != -1 && endSpot != -1)
          {
            // keep only the data between the start/end markers
            content = content.substring(startSpot, endSpot);
          }
          int len = content.length();
          if (len > 0)
          {
            int srchLen = searchScrapeTerm.length();
            retVal.append("Sorted By Date: ");
            retVal.append("(cached at ");
            retVal.append(new Date(lastVisitTime_).toString());
            retVal.append(")\n");
            retVal.append("<ol>\n");
            int countOfItems = 0;
            for (int i = 0; i < len && countOfItems < maxNewsItems_; i++)
            {
              startSpot = content.indexOf(searchScrapeTerm, i);
              endSpot = (startSpot == -1) ? -1 :
                  content.toUpperCase().indexOf(
                  endSearchScrapeTerm.toUpperCase(), startSpot);
              if (startSpot == -1 || endSpot == -1)
                break; // no further complete items on this page
              String scrapedLink = content.substring(
                  startSpot + srchLen, endSpot);
              countOfItems++;
              retVal.append("<li><a href=\"");
              if (!scrapedLink.toLowerCase().startsWith("http://"))
              {
                if (!scrapedLink.startsWith("/"))
                {
                  // relative link: resolve it against the scraped page's URL
                  retVal.append(getConnectionUrlStr());
                  if (!getConnectionUrlStr().endsWith("/"))
                    retVal.append("/");
                }
                else
                {
                  // absolute path: prepend the configured site prefix
                  retVal.append(linkPrependUrlStr_);
                }
              }
              retVal.append(cleanScrapedEntry(scrapedLink));
              if (endSearchScrapeTerm.equals("</a"))
                retVal.append("</a>");
              retVal.append("</li>\n");
              i = endSpot;
            }
            retVal.append("</ol>\n");
          }
          else
          {
            retVal.append("Sorry, Unable to get Java News Feed results.\n");
          }
        }
      }
      catch (IndexOutOfBoundsException iEx)
      {
        iEx.printStackTrace();
        retVal.append("Sorry, Unable to get Java News Feed.\n");
        retVal.append("Total Chars Read = " + totalCharsRead);
        retVal.append("\n");
        retVal.append("Current Chars Read = " + currentCharsRead);
        retVal.append("\n");
      }
      catch (Exception ex)
      {
        ex.printStackTrace();
        retVal.append("Sorry, Unable to get Java News Feed.");
      }
      useCachedResults_ = true;
      setFullScrape(retVal.toString());
    }
    newsSourceConnection = null;

    return getFullScrape();
  }


  /**
   *  The main program for the NewsScraper class. It serves as a test entry
   *  point.
   *
   * @param  args  The command line arguments
   */
  public static void main(String[] args)
  {
    NewsScraper news = new NewsScraper(
        "http://www-105.ibm.com/developerworks/news.nsf/dw/java-current-bydate?OpenDocument&Count=500&loc=j",
        "<!-- START CONTENT AREA -->",
        "<!-- END CONTENT AREA -->",
        "developerworks/cgi-bin/click.cgi?url=");
    news.doFullScrape();
    System.out.println(news.getFullScrape());
    System.out.println(news.content);

    // test the caching
    //news.doFullScrape();
    //System.out.println(news.getFullScrape());

    news.setConnectionUrlStr("http://www.apple.com/hotnews");
    news.setStartString("<!-- INSERT TOP STORIES -->");
    news.setEndString("<!-- END TOP STORIES -->");
    news.setSearchScrapeTerm("<B><A HREF=\"");
    news.setEndSearchScrapeTerm("<BR><BR>");
    news.setMaxNewsItems(10);
    news.setLinkPrependUrlStr("http://www.apple.com");
    news.doFullScrape();
    System.out.println(news.getFullScrape());
    System.out.println(news.content);
  }
}