001/*
002 *  $URL: svn://svn.webarts.bc.ca/open/trunk/projects/WebARTS/ca/bc/webarts/tools/UrlScraper.java $
003 *  $Author: tgutwin $
004 *  $Revision: 1255 $
005 *  $Date: 2018-03-17 20:34:04 -0700 (Sat, 17 Mar 2018) $
006 */
007/*
008 *  $Rev: 1255 $:     Revision of last commit
009 *  $Author: tgutwin $:  Author of last commit
010 *  $Date: 2018-03-17 20:34:04 -0700 (Sat, 17 Mar 2018) $:    Date of last commit
011 *  Copyright (C) 2017-2018    WebARTS Design,
012 *  North Vancouver Canada. All Rights Reserved.
013 *
014 *  Written by Tom Gutwin - WebARTS Design.
015 *  http://www.webarts.bc.ca
016 *
017 *  This program is free software; you can redistribute it and/or modify
018 *  it under the terms of the GNU General Public License as published by
019 *  the Free Software Foundation; either version 2 of the License, or
020 *  (at your option) any later version.
021 *
022 *  This program is distributed in the hope that it will be useful,
023 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
024 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
025 *  GNU General Public License for more details.
026 *
027 *  You should have received a copy of the GNU General Public License
028 *  along with this program; if not, write to the Free Software
029 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
030 */
031package ca.bc.webarts.tools;
032
033
034// import ca.bc.webarts.tools.MyCookieHandler;
035import java.io.BufferedReader;
036import java.io.DataOutputStream;
037import java.io.File;
038import java.io.FileNotFoundException;
039import java.io.FileOutputStream;
040import java.io.FileReader;
041import java.io.IOException;
042import java.io.InputStreamReader;
043import java.io.StringReader;
044import java.io.UnsupportedEncodingException;
045import java.net.CookieHandler;
046import java.net.CookieManager;
047import java.net.MalformedURLException;
048import java.net.ProtocolException;
049import java.net.URL;
050import java.net.URLEncoder;
051import java.util.ArrayList;
052import java.util.Calendar;
053import java.util.HashMap;
054import java.util.List;
055
056import javax.json.*;
057import javax.net.ssl.HttpsURLConnection;
058
059import org.jsoup.Jsoup;
060import org.jsoup.nodes.Document;
061import org.jsoup.nodes.Element;
062import org.jsoup.select.Elements;
063
064
065/**
 * A very basic tool using JSoup to log in to a web page, get authenticated and then load another page.
067 **/
068public class UrlScraper
069{
070
071  /**  A holder for this clients System File Separator.  */
072  public static final String SYSTEM_FILE_SEPERATOR = File.separator;
073  /**  A holder for this clients System line termination separator.  */
074  public static final String SYSTEM_LINE_SEPERATOR =
075  System.getProperty("line.separator");
076  protected static final String USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0";
077  protected static Calendar rightNow_ = Calendar.getInstance();
078  protected String dateStr_ = rightNow_.get(rightNow_.YEAR) + "-" + (rightNow_.get(rightNow_.MONTH) + 1 < 10 ? "0" + rightNow_.get(rightNow_.MONTH) + 1 : String.valueOf(rightNow_.get(rightNow_.MONTH) + 1)) + "-" + (rightNow_.get(rightNow_.DAY_OF_MONTH) < 10 ? "0" + rightNow_.get(rightNow_.DAY_OF_MONTH) : rightNow_.get(rightNow_.DAY_OF_MONTH));
079  protected boolean debugOut_ = false;
080  protected MyCookieHandler cm_ = new MyCookieHandler();
081  protected List<String> cookies_;
082  protected HttpsURLConnection connection_;
083  protected String baseUrl_ = "/";
084  protected boolean alreadyLoggedIn_ = false;
085  private String loginUrl_ = "";
086  /** Cache of the response text returned from the login page post. **/
087  protected String postPageResponse_ = "";
088  private HashMap<String, String> requestProps_ = new HashMap<String, String>();
089  private String loginFormID_ = "";
090  private String usernameFormElementName_ = "";
091  private String passwordFormElementName_ = "";
092  private String username_ = "";
093  private String password_ = "";
094  private String scrapePageUrl_ = "";
095  private String scrapeStart_ = "";
096  private String scrapeEnd_ = "";
097
098  /** Holds the most recent page scrape UNchopped. **/
099  private String pageContentCache_ = "";
100  protected String stockSymbolCache_ = "";
101  protected String marketSymbolCache_ = "";
102
103
104  /** default constructor does nothing. **/
105  public UrlScraper()
106  {
107    // make sure cookies are turn on
108    CookieHandler.setDefault(new CookieManager());
109  }
110
111
112  /** All In One constructor. **/
113  public UrlScraper(String loginUrl,
114                    HashMap<String, String> requestProps,
115                    String loginFormID,
116                    String usernameFormElementName,
117                    String passwordFormElementName,
118                    String username,
119                    String password,
120                    String scrapePageUrl,
121                    String scrapeStart,
122                    String scrapeEnd)
123  {
124    loginUrl_ = loginUrl;requestProps_ = requestProps;loginFormID_ = loginFormID;usernameFormElementName_ = usernameFormElementName;
125    passwordFormElementName_ = passwordFormElementName;username_ = username;password_ = password;scrapePageUrl_ = scrapePageUrl;
126    scrapeStart_ = scrapeStart;scrapeEnd_ = scrapeEnd;
127
128    // make sure cookies are turn on
129    CookieHandler.setDefault(new CookieManager());
130  }
131
132
133  /**
134   * Set Method for class field 'debugOut_' to true. Turns on extra debugging System.out stuff.
135   *
136   **/
137  public void setDebugOut()
138  {
139    this.debugOut_ = true;
140  }  // setDebugOut_ Method
141
142
143  /**
144   * Set Method for class field 'debugOut_'. Turns on/off extra debugging System.out stuff.
145   *
146   * @param debugOut_ is the value to set this class field to.
147   *
148   **/
149  public void setDebugOut(boolean debugOut)
150  {
151    this.debugOut_ = debugOut;
152  }  // setDebugOut_ Method
153
154
155  /**
156   * Get Method for class field 'debugOut_'.
157   *
158   * @return boolean - The value the class field 'debugOut_'.
159   *
160   **/
161  public boolean getDebugOut()
162  {
163    return debugOut_;
164  }  // getDebugOut Method
165
166
167  /**
168   * Set Method for class field 'baseUrl_'.
169   *
170   * @param baseUrl_ is the value to set this class field to.
171   *
172   **/
173  public void setBaseUrl(String baseUrl)
174  {
175    this.baseUrl_ = baseUrl;
176  }  // setBaseUrl Method
177
178
179  /**
180   * Get Method for class field 'baseUrl_'.
181   *
182   * @return String - The value the class field 'baseUrl_'.
183   *
184   **/
185  public String getBaseUrl()
186  {
187    return baseUrl_;
188  }  // getBaseUrl Method
189
190
191  /**
192   * Set Method for class field 'loginUrl_'.
193   *
194   * @param loginUrl is the value to set this class field to.
195   *
196   **/
197  public void setLoginUrl(String loginUrl)
198  {
199    this.loginUrl_ = loginUrl;
200  }  // setLoginUrl Method
201
202
203  /**
204   * Get Method for class field 'loginUrl_'.
205   *
206   * @return String - The value the class field 'loginUrl_'.
207   *
208   **/
209  public String getLoginUrl()
210  {
211    return loginUrl_;
212  }  // getLoginUrl Method
213
214
215  /**
216   * Set Method for class field 'requestProps_'.
217   *
218   * @param requestProps is the value to set this class field to.
219   *
220   **/
221  public void setRequestProps(HashMap<String, String> requestProps)
222  {
223    this.requestProps_ = requestProps;
224  }  // setRequestProps Method
225
226
227  /**
228   * Get Method for class field 'requestProps_'.
229   *
230   * @return HashMap<String, String> - The value the class field 'requestProps_'.
231   *
232   **/
233  public HashMap<String, String> getRequestProps()
234  {
235    return requestProps_;
236  }  // getRequestProps Method
237
238
239  /**
240   * Set Method for class field 'loginFormID_'.
241   *
242   * @param loginFormID is the value to set this class field to.
243   *
244   **/
245  public void setLoginFormID(String loginFormID)
246  {
247    this.loginFormID_ = loginFormID;
248  }  // setLoginFormID Method
249
250
251  /**
252   * Get Method for class field 'loginFormID_'.
253   *
254   * @return String - The value the class field 'loginFormID_'.
255   *
256   **/
257  public String getLoginFormID()
258  {
259    return loginFormID_;
260  }  // getLoginFormID Method
261
262
263  /**
264   * Set Method for class field 'usernameFormElementName_'.
265   *
266   * @param usernameFormElementName_ is the value to set this class field to.
267   *
268   **/
269  public void setUsernameFormElementName(String usernameFormElementName)
270  {
271    this.usernameFormElementName_ = usernameFormElementName;
272  }  // setUsernameFormElementName_ Method
273
274
275  /**
276   * Get Method for class field 'usernameFormElementName_'.
277   *
278   * @return String - The value the class field 'usernameFormElementName_'.
279   *
280   **/
281  public String getUsernameFormElementName()
282  {
283    return usernameFormElementName_;
284  }  // getUsernameFormElementName Method
285
286
287  /**
288   * Set Method for class field 'passwordFormElementName_'.
289   *
290   * @param passwordFormElementName_ is the value to set this class field to.
291   *
292   **/
293  public void setPasswordFormElementName(String passwordFormElementName)
294  {
295    this.passwordFormElementName_ = passwordFormElementName;
296  }  // setPasswordFormElementName Method
297
298
299  /**
300   * Get Method for class field 'passwordFormElementName_'.
301   *
302   * @return String - The value the class field 'passwordFormElementName_'.
303   *
304   **/
305  public String getPasswordFormElementName()
306  {
307    return passwordFormElementName_;
308  }  // getPasswordFormElementName Method
309
310
311  /**
312   * Set Method for class field 'username_'.
313   *
314   * @param username_ is the value to set this class field to.
315   *
316   **/
317  public void setUsername(String username)
318  {
319    this.username_ = username;
320  }  // setUsername Method
321
322
323  /**
324   * Get Method for class field 'username_'.
325   *
326   * @return String - The value the class field 'username_'.
327   *
328   **/
329  public String getUsername()
330  {
331    return username_;
332  }  // getUsername Method
333
334
335  /**
336   * Set Method for class field 'password_'.
337   *
338   * @param password_ is the value to set this class field to.
339   *
340   **/
341  public void setPassword(String password)
342  {
343    this.password_ = password;
344  }  // setPassword Method
345
346
347  /**
348   * Get Method for class field 'password_'.
349   *
350   * @return String - The value the class field 'password_'.
351   *
352   **/
353  public String getPassword()
354  {
355    return password_;
356  }  // getPassword Method
357
358
359  /**
360   * Set Method for class field 'scrapePageUrl_'.
361   *
362   * @param scrapePageUrl_ is the value to set this class field to.
363   *
364   **/
365  public void setScrapePageUrl(String scrapePageUrl)
366  {
367    this.scrapePageUrl_ = scrapePageUrl;
368  }  // setScrapePageUrl_ Method
369
370
371  /**
372   * Get Method for class field 'scrapePageUrl_'.
373   *
374   * @return String - The value the class field 'scrapePageUrl_'.
375   *
376   **/
377  public String getScrapePageUrl()
378  {
379    return scrapePageUrl_;
380  }  // getScrapePageUrl Method
381
382
383  /**
384   * Set Method for class field 'scrapeStart_'.
385   * This field is used as the substring start tag for the return when searching the response string.
386   *
387   * @param scrapeStart_ is the value to set this class field to.
388   *
389   **/
390  public void setScrapeStart(String scrapeStart)
391  {
392    this.scrapeStart_ = scrapeStart;
393  }  // setScrapeStart Method
394
395
396  /**
397   * Get Method for class field 'scrapeStart_'.
398   * This field is used as the substring start tag for the return when searching the response string.
399   *
400   * @return String - The value the class field 'scrapeStart_'.
401   *
402   **/
403  public String getScrapeStart()
404  {
405    return scrapeStart_;
406  }  // getScrapeStart Method
407
408
409  /**
410   * Set Method for class field 'scrapeEnd_'.
411   * This field is used as the substring end tag for the return when searching the response string.
412   *
413   * @param scrapeEnd_ is the value to set this class field to.
414   *
415   **/
416  public void setScrapeEnd(String scrapeEnd)
417  {
418    this.scrapeEnd_ = scrapeEnd;
419  }  // setScrapeEnd Method
420
421
422  /**
423   * Get Method for class field 'scrapeEnd_'.
424   * This field is used as the substring end tag for the return when searching the response string.
425   *
426   * @return String - The value the class field 'scrapeEnd_'.
427   *
428   **/
429  public String getScrapeEnd()
430  {
431    return scrapeEnd_;
432  }  // getScrapeEnd Method
433
434
435  public List<String> getCookies()
436  {
437    return cookies_;
438  }
439
440
441  public void setCookies(List<String> cookies)
442  {
443    if (debugOut_)
444    {
445      System.out.println("New cookies: " + cookies);
446      for (String cookie : cookies) {
447        System.out.println("    --> cookie " + cookie);
448      }
449    }
450
451
452    this.cookies_ = cookies;
453  }
454
455
456  /**
457   * Set Method for class field 'connection_'.
458   *
459   * @param connection_ is the value to set this class field to.
460   *
461   **/
462  public void setConnection(HttpsURLConnection connection)
463  {
464    this.connection_ = connection;
465  }  // setConnection Method
466
467
468  /**
469   * Get Method for class field 'connection_'.
470   *
471   * @return HttpsURLConnection - The value the class field 'connection_'.
472   *
473   **/
474  public HttpsURLConnection getConnection()
475  {
476    return connection_;
477  }  // getConnection Method
478
479
480  /**
481   * Sends the POST to the login url  parameters from the classVars.
482   * It also caches the page response text into thew classVar postPageResponse_.<br>
483   */
484  public boolean doLogin()
485  {
486    return this.doLogin(loginUrl_, requestProps_, loginFormID_, usernameFormElementName_, passwordFormElementName_, username_, password_);
487  }
488
489
490  /**
491   * Sends the POST to the login url with all required parameters.
492   * It also caches the page response text into thew classVar postPageResponse_.<br>
493   * <br> example requestProps<pre>
494   *     HashMap <String, String> reqProps = new HashMap<String, String>();
495   *     reqProps.put("Accept","text/html,application/xhtml+xml,application/xml");
496   *     reqProps.put("Accept-Encoding  ","gzip, deflate, br");
497   *     reqProps.put("Accept-Language  ","en-US,en;q=0.5");
498   *     reqProps.put("Connection","keep-alive");
499   *     reqProps.put("Content-Type","application/x-www-form-urlencoded");
500   *     reqProps.put("Host","red.webarts.bc.ca");
501   *     //reqProps.put("Referer","red.webarts.bc.ca");
502   *     reqProps.put("Upgrade-Insecure-Requests","1");
503   *</pre><br><B>NOTE:</B> you don't need to add the "User-Agent" or "Content-Length"
504   *
505   * @param loginUrl
506   * @param requestProps
507   * @param formID
508   * @param usernameFormElementName
509   * @param passwordFormElementName
510   * @param username
511   * @param password
512   * @return boolean true if loggedIn
513   **/
514  public boolean doLogin( String loginUrl,
515                          HashMap<String, String> requestProps,
516                          String formID,
517                          String usernameFormElementName,
518                          String passwordFormElementName,
519                          String username,
520                          String password)
521  {
522    boolean retVal = false;
523    try
524    {
525      baseUrl_ = loginUrl.substring(0, loginUrl.lastIndexOf('/') - 1);
526      if (debugOut_) System.out.println("\n doLogin \n");
527      if (debugOut_) System.out.println("loginUrl_:" + loginUrl);
528      if (debugOut_) System.out.println("requestProps_:" + requestProps);
529      if (debugOut_) System.out.println("loginFormID_:" + formID);
530      if (debugOut_) System.out.println("usernameFormElementName_:" + usernameFormElementName);
531      if (debugOut_) System.out.println("passwordFormElementName_:" + passwordFormElementName);
532      if (debugOut_) System.out.println("username_,password_:" + username + " , " + password);
533      String page = getPageContent(loginUrl);
534      if (debugOut_) System.out.println("-------------------\nLoginPage:\n" + page + "\n-----------------\n");
535
536      String postParams = getLoginFormParams(    page,
537      formID,
538      usernameFormElementName, passwordFormElementName,
539      username, password);
540      retVal = (sendPost(loginUrl, postParams, requestProps) == 200 ? true : false);
541    }
542    catch (Exception ex)
543    {
544      retVal = false;
545    }
546    alreadyLoggedIn_ = retVal;
547    return retVal;
548  }
549
550
551  /** Scrape and return string between the scrapeStart and scrapeEnd from the url. **/
552  public String doScrape( String scrapePageUrl,
553                          String scrapeStart,
554                          String scrapeEnd)
555  {
556    String retVal = "";
557    try
558    {
559      String result = getPageContent(scrapePageUrl);
560      retVal = result.substring(    result.indexOf(scrapeStart),
561      result.indexOf(scrapeEnd));
562    }
563    catch (Exception ex)
564    {
565      retVal = "";
566    }
567    return retVal;
568  }
569
570
571  /** Scrape and return string between the scrapeStart and scrapeEnd from the url. **/
572  public String doScrape( String scrapePageUrl,
573                          String scrapeStart,
574                          String scrapeEnd,
575                          HashMap<String, String> reqProps,
576                          boolean useCache)
577  {
578    String retVal = "";
579    try
580    {
581      String result = pageContentCache_;
582      if(!useCache || "".equals(pageContentCache_))
583      {
584        if (debugOut_) System.out.println(" doScrape : "+scrapePageUrl);
585        result = getPageContent(scrapePageUrl, reqProps);
586      }
587      pageContentCache_ = result;
588      int s = result.indexOf(scrapeStart);
589      int e = result.indexOf(scrapeEnd);
590      if (s==-1) s=0;
591      if (e==-1) e=result.length();
592      if (debugOut_) System.out.println("  Scrape Start="+scrapeStart + "   index="+s);
593      if (debugOut_) System.out.println("  Scrape End="+scrapeEnd + "   index="+e);
594      retVal = result.substring( s, e);
595    }
596    catch (Exception ex)
597    {
598      retVal = "";
599    }
600    return retVal;
601  }
602
603
604  /** Scrape (not using cache) and return response string from the url. **/
605  public String doScrape( String scrapePageUrl,
606                          String scrapeStart,
607                          String scrapeEnd,
608                          HashMap<String, String> reqProps)
609  {
610    return doScrape(  scrapePageUrl,
611                      scrapeStart,
612                      scrapeEnd,
613                      reqProps,
614                      false);
615  }
616
617
618  /** Scrape and return response string from the url. **/
619  public String doScrape(String scrapePageUrl, HashMap<String, String> reqProps)
620  {
621    return doScrape(  scrapePageUrl,
622                      getScrapeStart(),
623                      getScrapeEnd(),
624                      reqProps,
625                      false);
626  }
627
628
629  /** scrapes and returns the data string using the default class fields for scrape start and end and requestprops. **/
630  public String doScrape(String scrapePageUrl)
631  {
632    return doScrape(  scrapePageUrl, getScrapeStart(), getScrapeEnd(), getRequestProps(), false);
633  }
634
635
636  /** Scrape (optionally using the cache) and return response string from the url. **/
637  public String doScrape(HashMap<String, String> reqProps, boolean useCache)
638  {
639    return doScrape(getScrapePageUrl(), getScrapeStart(), getScrapeEnd(), reqProps, useCache);
640  }
641
642
643  /** Scrape and return response string from the url. **/
644  public String doScrape(HashMap<String, String> reqProps)
645  {
646    return doScrape(getScrapePageUrl(), getScrapeStart(), getScrapeEnd(), reqProps, false);
647  }
648
649
650  /** Scrape (optionally using the cache) and return response string from the url. **/
651  public String doScrape(boolean useCache)
652  {
653    return doScrape(getScrapePageUrl(), getScrapeStart(), getScrapeEnd(), getRequestProps(), useCache);
654  }
655
656
657  /** Scrape and return response string from the url. **/
658  public String doScrape()
659  {
660    return doScrape(getScrapePageUrl(), getScrapeStart(), getScrapeEnd(), getRequestProps(), false);
661  }
662
663
664  /** Makes the JSON string pretty with indenting. **/
665  public static String prettyJson(String jsonStr)
666  {
667    String retVal = jsonStr;
668    retVal = retVal.replace("[", " [\n");
669    retVal = retVal.replace("]", "  ]" + SYSTEM_LINE_SEPERATOR);
670    retVal = retVal.replace("]\n\"", "  ]\"" + SYSTEM_LINE_SEPERATOR);
671    retVal = retVal.replace("{", "  {" + SYSTEM_LINE_SEPERATOR + "      ");
672    retVal = retVal.replace("}", "}" + SYSTEM_LINE_SEPERATOR);
673    retVal = retVal.replace(",", "," + SYSTEM_LINE_SEPERATOR + "      ");
674    retVal = retVal.replace("}" + SYSTEM_LINE_SEPERATOR + "," + SYSTEM_LINE_SEPERATOR, "    }," + SYSTEM_LINE_SEPERATOR);
675    retVal = retVal.replace("[\n", "  [ ");
676    retVal = retVal.replace("        {", "    {");
677    retVal = retVal.replace("[   {", "[\n   {");
678    retVal = retVal.replace("\n}", "\n    }");
679    return retVal;
680  }
681
682
683  /** As the name says. **/
684  public JsonObject toJsonObject(String jsonStr)
685  {
686    JsonReader jsonReader = Json.createReader(new StringReader(jsonStr));
687    JsonObject jsO = jsonReader.readObject();
688    jsonReader.close();
689    return jsO;
690  }
691
692
693  public JsonStructure toJsonStructure(String jsonStr)
694  {
695    JsonReader jsonReader = Json.createReader(new StringReader(jsonStr));
696    JsonStructure jsO = jsonReader.read();
697    jsonReader.close();
698    return jsO;
699  }
700
701
702  /** Test method to do whatever tests I want. **/
703  protected void test(String[] args)
704  {
705    test(args, getLoginFormID(), getRequestProps(),
706    getLoginFormID(), getUsernameFormElementName(),
707    getPasswordFormElementName(), getUsername(), getPassword());
708  }
709
710
711  /** . **/
712  protected void test(String[] args,
713                      String loginUrl,
714                      HashMap<String, String> requestProps,
715                      String formID,
716                      String usernameFormElementName,
717                      String passwordFormElementName,
718                      String username,
719                      String password)
720  {
721    String result = readFileToString("scrapedPageContent-" + dateStr_ + ".json");
722    JsonStructure jsS = toJsonStructure(result);
723    System.out.println("\n\n---------------------------\nFull Data Structure: \n---------------------------\n" + jsS.toString());
724    System.out.println("ValueType=" + jsS.getValueType());
725    JsonObject jsO = (JsonObject)jsS;
726    JsonArray jsA = jsO.getJsonArray("Data");
727    JsonObject dataHome = jsA.getJsonObject(0);
728    String friendlyName = dataHome.getString("FriendlyName");
729    System.out.println("    FriendlyName       : " + friendlyName);
730    System.out.println("    MarketValue        : " + dataHome.getJsonNumber("MarketValue"));
731    System.out.println("    TradeCash          : " + dataHome.getJsonNumber("TradeCash"));
732    System.out.println("    BookValue          : " + dataHome.getJsonNumber("BookValue"));
733    System.out.println("    ------------------   ------------------------");
734    System.out.println("    UnrealizedGainLoss : " + dataHome.getJsonNumber("UnrealizedGainLoss"));
735    JsonArray holdings = dataHome.getJsonArray("Holdings");
736
737    // System.out.println("holdings ValueType="+holdings.getValueType());
738    System.out.println("\n\n---------------------------\nHoldings:\n---------------------------\n" + prettyJson(holdings.toString()));
739  }
740
741
742  public static void main(String[] args)
743  {
744    String loginUrl = "https://red.webarts.bc.ca/owncloud/";
745    String scrapePageUrl = "https://red.webarts.bc.ca/owncloud/index.php/apps/files/";
746    String ocLoginFormElement = "login";  /* id of the form element */
747    String ocUserLoginElement = "user";  /* the element name in the form to look for */
748    String ocPasswordElement = "password";  /* the element name in the form to look for */
749    String ocUserLogin = "tgutwin";
750    String ocPassword = "";
751
752    // String baseUrl="/";
753    HashMap<String, String> reqProps = new HashMap<String, String>();
754    reqProps.put("Accept", "text/html,application/xhtml+xml,application/xml");
755    reqProps.put("Accept-Encoding       ", "gzip, deflate, br");
756    reqProps.put("Accept-Language       ", "en-US,en;q=0.5");
757    reqProps.put("Connection", "keep-alive");
758    reqProps.put("Content-Type", "application/x-www-form-urlencoded");
759    reqProps.put("Upgrade-Insecure-Requests", "1");
760    reqProps.put("Host", "red.webarts.bc.ca");
761    UrlScraper instance = new UrlScraper();
762    if (args.length > 0 && args[0].toLowerCase().equals("-t"))
763    {
764      instance.test(args, loginUrl, reqProps, ocLoginFormElement, ocUserLoginElement, ocPasswordElement, ocUserLogin, ocPassword);
765    }
766    else
767    {
768
769      /*
770       * @param loginUrl
771       * @param requestProps
772       * @param formID
773       * @param usernameFormElementName
774       * @param passwordFormElementName
775       * @param username
776       * @param password
777       * @return boolean true if loggedIn
778       */
779      boolean success = false;
780      success = instance.doLogin(loginUrl, reqProps, ocLoginFormElement, ocUserLoginElement, ocPasswordElement, ocUserLogin, ocPassword);
781
782      // 4. success then go to the page you want.
783      String result = "";
784      String summaryTableHtml = "";
785      if (success)
786      {
787        result = instance.doScrape(scrapePageUrl);
788        writeStringToFile(result, "ocPageContent-" + instance.dateStr_ + ".html");
789        System.out.println(result);
790      }
791    }
792  }
793
794
795  /**
796   * Sends a POST request to the url, along with all the passed post parameters and request properties.
797   *
798   * @param url is the url to post this stuff to
799   * @param postParams is a sequential string of params that get sent in this post
800   * @param
801   **/
802  protected int sendPost(String url, String postParams, HashMap<String, String> reqProps) throws MalformedURLException, IOException, ProtocolException
803  {
804    if (debugOut_) System.out.println("\n sendPost to url=" + url);
805    URL obj = new URL(url);
806    connection_ = (HttpsURLConnection)obj.openConnection();
807
808    // Act like a browser
809    connection_.setUseCaches(false);
810    connection_.setRequestMethod("POST");
811    connection_.setRequestProperty("User-Agent", USER_AGENT);
812    for (String key : reqProps.keySet())
813    {
814      if (!"User-Agent".equals(key) && !"Content-Length".equals(key) && !"Cookie".equals(key) && !"".equals(key.trim()) && !"".equals(((String)reqProps.get(key)).trim()))
815      {
816        connection_.setRequestProperty(key.trim(), ((String)reqProps.get(key)).trim());
817      }
818
819
820      if (debugOut_)
821      {
822        System.out.println("   setRequestProperty  " + key.trim() + " = " + ((String)reqProps.get(key)).trim());
823      }
824    }
825    connection_.setRequestProperty("Content-Length", Integer.toString(postParams.length()));
826
827    /*
828     * for (String cookie : this.cookies_)
829     * {
830     * if(   !"".equals(cookie)
831     * && !"".equals(((String)cookie.split(";", 1)[0]).trim()) )
832     * {
833     * connection_.addRequestProperty("Cookie", cookie.split(";", 1)[0]);
834     * if(debugOut_) System.out.println("   full  cookie = "+cookie);
835     * if(debugOut_) System.out.println("   adding cookie = "+cookie.split(";", 1)[0]);
836     * }
837     * }
838     */
839    connection_.setDoOutput(true);
840    connection_.setDoInput(true);
841
842    // connection_.setInstanceFollowRedirects(false);
843    cm_.setCookies(connection_);
844    connection_.connect();
845
846    // Send post request
847    DataOutputStream wr = new DataOutputStream(connection_.getOutputStream());
848    wr.writeBytes(postParams);
849    wr.flush();
850    wr.close();
851    int responseCode = connection_.getResponseCode();
852    cm_.storeCookies((java.net.URLConnection)connection_);
853    System.out.println("\nSent 'POST' request to URL : " + url);
854    System.out.println("Post parameters : " + postParams);
855    System.out.println("Response Code : " + responseCode);
856
857    // Read the response
858    BufferedReader in = new BufferedReader(new InputStreamReader(connection_.getInputStream()));
859    String inputLine;
860    StringBuffer response = new StringBuffer();
861    while ((inputLine = in.readLine()) != null)
862    {
863      response.append(inputLine);
864    }
865    in.close();
866    postPageResponse_ = response.toString();
867    if (debugOut_)  System.out.println("\n\n  -----------------\nPostPage Response:  ");
868    if (debugOut_) System.out.println(postPageResponse_);
869
870    writeStringToFile(postPageResponse_, "postPageResponse.html");
871    return responseCode;
872  }
873
874
875  /** Connects/retrieves a URL; pulls its cookies and returns the resulting htnl as a string.
876   **/
877  protected String getPageContent(String url) throws MalformedURLException, IOException, ProtocolException
878  {
879    return getPageContent(url, null);
880  }
881
882
883  /**
884   * Connects/retrieves a URL; adds its cookies and returns the resulting htnl as a string.
885   *
886   * @param url is the url to get
887   * @param reqProps optional (can be null) map of properties to add as requestProperties
888   **/
889  protected String getPageContent(String url, HashMap<String, String> reqProps) throws MalformedURLException, IOException, ProtocolException
890  {
891    if (debugOut_)
892    {
893      System.out.println("\n GetPageContent from url= " + url);
894    }
895    URL obj = new URL(url);
896    connection_ = (HttpsURLConnection)obj.openConnection();
897
898    // default is GET
899    connection_.setRequestMethod("GET");
900    connection_.setUseCaches(false);
901
902    // act like a browser
903    connection_.setRequestProperty("User-Agent", USER_AGENT);
904    connection_.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
905    connection_.setRequestProperty("Accept-Language", "en-US,en;q=0.5");
906    if (reqProps != null)
907    {
908      for (String key : reqProps.keySet())
909      {
910        if (!"User-Agent".equals(key) &&
911            !"Content-Length".equals(key) &&
912            !"Cookie".equals(key) &&
913            !"".equals(key.trim()) &&
914            !"".equals(((String)reqProps.get(key)).trim()))
915        {
916          connection_.setRequestProperty(key.trim(), ((String)reqProps.get(key)).trim());
917          if (debugOut_) System.out.println("   setRequestProperty  " + key.trim() + " = " + ((String)reqProps.get(key)).trim());
918        }
919      }
920    }
921    if (cookies_ != null)
922    {
923      for (String cookie : this.cookies_)
924      {
925        if (debugOut_) System.out.println("adding cookies to page request: " + cookie + " (" + cookie.split(";", 1)[0] + ")");
926        connection_.addRequestProperty("Cookie", cookie.split(";", 1)[0]);  // add only the first in a group
927      }
928    }
929
930    cm_.setCookies(connection_);
931    connection_.connect();
932    System.out.println("\nSending 'GET' request to URL : " + url);
933    BufferedReader in = new BufferedReader(new InputStreamReader(connection_.getInputStream()));
934    String inputLine;
935    StringBuffer response = new StringBuffer();
936    while ((inputLine = in.readLine()) != null)
937    {
938      response.append(inputLine);
939    }
940    in.close();
941    int responseCode = connection_.getResponseCode();
942    System.out.println("Response Code : " + responseCode);
943    cm_.storeCookies((java.net.URLConnection)connection_);
944
945    // Get the response cookies
946    setCookies(connection_.getHeaderFields().get("Set-Cookie"));
947    return response.toString();
948  }
949
950
951  /** Concatenates together the URL parameter string (ie. name=value&name2=val2&someOtherParamName=val3  )
952   * for a specific login form in the passed URL/html string. It also subsititutes the username and password for
953   * for the appropriate form elements. <br> This parameterString ends up getting sent to the POST request for a that form.
954   *
955   * @param html is the string representation of the URL that has the form to parse
956   * @param formID the form id (or name) to parse parameters from
957   * @param usernameFormElementName the form elementName used for the username field
958   * @param passwordFormElementNamethe form elementName used for the password field
959   * @param username the actual login username to use in the form
960   * @param password the actual login password to use in the form
961   **/
962  public String getLoginFormParams(String html,
963  String formID,
964  String usernameFormElementName,
965  String passwordFormElementName,
966  String username,
967  String password) throws UnsupportedEncodingException
968  {
969    System.out.println("Extracting form's data...");
970    if (debugOut_)      System.out.println("\n ---------------------------------------- \n" + html + "\n ---------------------------------------- \n");
971
972    Document doc = Jsoup.parse(html);
973    if (debugOut_) System.out.println("\n doc is parsed?=" + (doc != null));
974    if (debugOut_) System.out.println(" Looking for form id=" + formID);
975
976    Element loginform = null;
977    loginform = doc.getElementById(formID);
978    if (loginform == null)
979    {
980      loginform = doc.getElementsByAttributeValue("name", formID).first();
981    }
982    else
983    if (loginform == null)
984    {
985      loginform = doc.getElementsByAttributeValue("class", formID).first();
986    }
987
988    Elements inputElements = loginform.getElementsByTag("input");
989    List<String> paramList = new ArrayList<String>();
990    for (Element inputElement : inputElements) {
991      String key = inputElement.attr("name");
992      String value = inputElement.attr("value");
993      if (debugOut_)
994      {
995        System.out.println("     form element " + key + " = " + value);
996      }
997      if (key.equals(usernameFormElementName))
998      {
999        value = username;
1000      }
1001      else
1002      if (key.equals(passwordFormElementName))
1003      {
1004        value = password;
1005      }
1006      else
1007      if (key.equals("timezone"))
1008      {
1009        value = "America/Los_Angeles";
1010      }
1011      else
1012      if (key.equals("timezone-offset"))
1013      {
1014        value = "-7";
1015      }
1016      paramList.add(key + "=" + URLEncoder.encode(value, "UTF-8"));
1017    }
1018
1019    // build parameters list
1020    StringBuilder result = new StringBuilder();
1021    for (String param : paramList)
1022    {
1023      if (result.length() == 0)
1024      {
1025        result.append(param);
1026      }
1027      else
1028      {
1029        result.append("&" + param);
1030      }
1031    }
1032    return result.toString();
1033  }
1034
1035
1036  /**
1037   * Abstracts the writing of string to a file.
1038   *
1039   * @param s is the String to writeout
1040   * @param fileName is the file name of the file to write the String into
1041   * @return if success.. the full pathed filename is returned else null
1042   **/
1043  public static String writeStringToFile(String s, String fileName)
1044  {
1045    return writeStringToFile(s, fileName, false);
1046  }
1047
1048
1049  /**
1050   * Abstracts the writing of string to a (zip) file (Zip NOT IMPLEMENTED YET).
1051   *
1052   * @param s is the String to writeout
1053   * @param fileName is the file name of the file to write the String into
1054   * @param zipCompress boolean fall to compress with zip compression
1055   * @return if success.. the full pathed filename is returned else null
1056   **/
1057  public static String writeStringToFile(String s, String fileName, boolean zipCompress)
1058  {
1059    String retVal = fileName;
1060    try
1061    {
1062      // FileWriter was not closing the stream
1063      /*
1064       * FileWriter f = new FileWriter(fileName);
1065       * f.write(s);
1066       * f.flush();
1067       * f.close();
1068       * f = null;
1069       */
1070      FileOutputStream fos = new FileOutputStream(fileName);
1071      byte[] strBytes = s.getBytes();
1072      fos.write(strBytes);
1073      fos.flush();
1074      fos.close();
1075      fos = null;
1076      System.gc();  // this is required because a bug in Java won't realease
1077    }
1078    catch (IOException ioEx)
1079    {
1080      System.out.println("\nERROR Writing file: " + fileName);
1081      retVal = null;
1082    }
1083    return retVal;
1084  }
1085
1086
1087  /**
1088   * Abstracts the reading of a file and returns the contents as a String.
1089   *
1090   * @param fileName is the file naem to read into a String
1091   * @return the Text file contents as a String
1092   **/
1093  public static String readFileToString(String fileName)
1094  {
1095    String stringLine;
1096    BufferedReader in;
1097    StringBuffer stringOut = new StringBuffer();
1098    try
1099    {
1100      in = new BufferedReader(new FileReader(fileName));
1101      while ((stringLine = in.readLine()) != null) {
1102        stringOut.append(stringLine);
1103        stringOut.append(SYSTEM_LINE_SEPERATOR);
1104      }
1105    }
1106    catch (FileNotFoundException fnfEx)
1107    {
1108      System.out.println("Cannot find file: " + fileName);
1109    }
1110    catch (IOException ioEx)
1111    {
1112      System.out.println("Error Reading File to String: " + fileName);
1113    }
1114    return stringOut.toString();
1115  }
1116}