public class UrlScraper extends Object
Modifier and Type | Field and Description |
---|---|
protected boolean |
alreadyLoggedIn_ |
protected String |
baseUrl_ |
protected MyCookieHandler |
cm_ |
protected HttpsURLConnection |
connection_ |
protected List<String> |
cookies_ |
protected String |
dateStr_ |
protected boolean |
debugOut_ |
private String |
loginFormID_ |
private String |
loginUrl_ |
protected String |
marketSymbolCache_ |
private String |
pageContentCache_
Holds the most recent page scrape UNchopped.
|
private String |
password_ |
private String |
passwordFormElementName_ |
protected String |
postPageResponse_
Cache of the response text returned from the login page post.
|
private HashMap<String,String> |
requestProps_ |
protected static Calendar |
rightNow_ |
private String |
scrapeEnd_ |
private String |
scrapePageUrl_ |
private String |
scrapeStart_ |
protected String |
stockSymbolCache_ |
static String |
SYSTEM_FILE_SEPERATOR
A holder for this client's system file separator.
|
static String |
SYSTEM_LINE_SEPERATOR
A holder for this client's system line termination separator.
|
protected static String |
USER_AGENT |
private String |
username_ |
private String |
usernameFormElementName_ |
Constructor and Description |
---|
UrlScraper()
default constructor does nothing.
|
UrlScraper(String loginUrl,
HashMap<String,String> requestProps,
String loginFormID,
String usernameFormElementName,
String passwordFormElementName,
String username,
String password,
String scrapePageUrl,
String scrapeStart,
String scrapeEnd)
All In One constructor.
|
Modifier and Type | Method and Description |
---|---|
boolean |
doLogin()
Sends the POST to the login url, using the parameters held in the class fields.
|
boolean |
doLogin(String loginUrl,
HashMap<String,String> requestProps,
String formID,
String usernameFormElementName,
String passwordFormElementName,
String username,
String password)
Sends the POST to the login url with all required parameters.
|
String |
doScrape()
Scrape and return response string from the url.
|
String |
doScrape(boolean useCache)
Scrape (optionally using the cache) and return response string from the url.
|
String |
doScrape(HashMap<String,String> reqProps)
Scrape and return response string from the url.
|
String |
doScrape(HashMap<String,String> reqProps,
boolean useCache)
Scrape (optionally using the cache) and return response string from the url.
|
String |
doScrape(String scrapePageUrl)
Scrapes and returns the data string using the default class fields for scrape start, scrape end and request properties.
|
String |
doScrape(String scrapePageUrl,
HashMap<String,String> reqProps)
Scrape and return response string from the url.
|
String |
doScrape(String scrapePageUrl,
String scrapeStart,
String scrapeEnd)
Scrape and return string between the scrapeStart and scrapeEnd from the url.
|
String |
doScrape(String scrapePageUrl,
String scrapeStart,
String scrapeEnd,
HashMap<String,String> reqProps)
Scrape (not using cache) and return response string from the url.
|
String |
doScrape(String scrapePageUrl,
String scrapeStart,
String scrapeEnd,
HashMap<String,String> reqProps,
boolean useCache)
Scrape and return string between the scrapeStart and scrapeEnd from the url.
|
String |
getBaseUrl()
Get Method for class field 'baseUrl_'.
|
HttpsURLConnection |
getConnection()
Get Method for class field 'connection_'.
|
List<String> |
getCookies() |
boolean |
getDebugOut()
Get Method for class field 'debugOut_'.
|
String |
getLoginFormID()
Get Method for class field 'loginFormID_'.
|
String |
getLoginFormParams(String html,
String formID,
String usernameFormElementName,
String passwordFormElementName,
String username,
String password)
Concatenates together the URL parameter string (ie. name=value&name2=val2&someOtherParamName=val3 )
for a specific login form in the passed URL/html string.
|
String |
getLoginUrl()
Get Method for class field 'loginUrl_'.
|
protected String |
getPageContent(String url)
Connects/retrieves a URL; pulls its cookies and returns the resulting html as a string.
|
protected String |
getPageContent(String url,
HashMap<String,String> reqProps)
Connects/retrieves a URL; adds its cookies and returns the resulting html as a string.
|
String |
getPassword()
Get Method for class field 'password_'.
|
String |
getPasswordFormElementName()
Get Method for class field 'passwordFormElementName_'.
|
HashMap<String,String> |
getRequestProps()
Get Method for class field 'requestProps_'.
|
String |
getScrapeEnd()
Get Method for class field 'scrapeEnd_'.
|
String |
getScrapePageUrl()
Get Method for class field 'scrapePageUrl_'.
|
String |
getScrapeStart()
Get Method for class field 'scrapeStart_'.
|
String |
getUsername()
Get Method for class field 'username_'.
|
String |
getUsernameFormElementName()
Get Method for class field 'usernameFormElementName_'.
|
static void |
main(String[] args) |
static String |
prettyJson(String jsonStr)
Makes the JSON string pretty with indenting.
|
static String |
readFileToString(String fileName)
Abstracts the reading of a file and returns the contents as a String.
|
protected int |
sendPost(String url,
String postParams,
HashMap<String,String> reqProps)
Sends a POST request to the url, along with all the passed post parameters and request properties.
|
void |
setBaseUrl(String baseUrl)
Set Method for class field 'baseUrl_'.
|
void |
setConnection(HttpsURLConnection connection)
Set Method for class field 'connection_'.
|
void |
setCookies(List<String> cookies) |
void |
setDebugOut()
Set Method for class field 'debugOut_' to true.
|
void |
setDebugOut(boolean debugOut)
Set Method for class field 'debugOut_'.
|
void |
setLoginFormID(String loginFormID)
Set Method for class field 'loginFormID_'.
|
void |
setLoginUrl(String loginUrl)
Set Method for class field 'loginUrl_'.
|
void |
setPassword(String password)
Set Method for class field 'password_'.
|
void |
setPasswordFormElementName(String passwordFormElementName)
Set Method for class field 'passwordFormElementName_'.
|
void |
setRequestProps(HashMap<String,String> requestProps)
Set Method for class field 'requestProps_'.
|
void |
setScrapeEnd(String scrapeEnd)
Set Method for class field 'scrapeEnd_'.
|
void |
setScrapePageUrl(String scrapePageUrl)
Set Method for class field 'scrapePageUrl_'.
|
void |
setScrapeStart(String scrapeStart)
Set Method for class field 'scrapeStart_'.
|
void |
setUsername(String username)
Set Method for class field 'username_'.
|
void |
setUsernameFormElementName(String usernameFormElementName)
Set Method for class field 'usernameFormElementName_'.
|
protected void |
test(String[] args)
Test method to do whatever tests I want.
|
protected void |
test(String[] args,
String loginUrl,
HashMap<String,String> requestProps,
String formID,
String usernameFormElementName,
String passwordFormElementName,
String username,
String password)
.
|
JsonObject |
toJsonObject(String jsonStr)
As the name says.
|
JsonStructure |
toJsonStructure(String jsonStr) |
static String |
writeStringToFile(String s,
String fileName)
Abstracts the writing of string to a file.
|
static String |
writeStringToFile(String s,
String fileName,
boolean zipCompress)
Abstracts the writing of string to a (zip) file (Zip NOT IMPLEMENTED YET).
|
public static final String SYSTEM_FILE_SEPERATOR
public static final String SYSTEM_LINE_SEPERATOR
protected static final String USER_AGENT
protected boolean debugOut_
protected MyCookieHandler cm_
protected HttpsURLConnection connection_
protected boolean alreadyLoggedIn_
protected String postPageResponse_
private HashMap<String,String> requestProps_
private String loginFormID_
private String usernameFormElementName_
private String passwordFormElementName_
private String scrapePageUrl_
private String scrapeStart_
private String scrapeEnd_
private String pageContentCache_
protected String stockSymbolCache_
protected String marketSymbolCache_
public UrlScraper()
public void setDebugOut()
public void setDebugOut(boolean debugOut)
debugOut_
- is the value to set this class field to.
public boolean getDebugOut()
public void setBaseUrl(String baseUrl)
baseUrl_
- is the value to set this class field to.
public String getBaseUrl()
public void setLoginUrl(String loginUrl)
loginUrl
- is the value to set this class field to.
public String getLoginUrl()
public void setRequestProps(HashMap<String,String> requestProps)
requestProps
- is the value to set this class field to.
public HashMap<String,String> getRequestProps()
public void setLoginFormID(String loginFormID)
loginFormID
- is the value to set this class field to.
public String getLoginFormID()
public void setUsernameFormElementName(String usernameFormElementName)
usernameFormElementName_
- is the value to set this class field to.
public String getUsernameFormElementName()
public void setPasswordFormElementName(String passwordFormElementName)
passwordFormElementName_
- is the value to set this class field to.
public String getPasswordFormElementName()
public void setUsername(String username)
username_
- is the value to set this class field to.
public String getUsername()
public void setPassword(String password)
password_
- is the value to set this class field to.
public String getPassword()
public void setScrapePageUrl(String scrapePageUrl)
scrapePageUrl_
- is the value to set this class field to.
public String getScrapePageUrl()
public void setScrapeStart(String scrapeStart)
scrapeStart_
- is the value to set this class field to.
public String getScrapeStart()
public void setScrapeEnd(String scrapeEnd)
scrapeEnd_
- is the value to set this class field to.
public String getScrapeEnd()
public List<String> getCookies()
public void setCookies(List<String> cookies)
public void setConnection(HttpsURLConnection connection)
connection_
- is the value to set this class field to.
public HttpsURLConnection getConnection()
public boolean doLogin()
public boolean doLogin(String loginUrl, HashMap<String,String> requestProps, String formID, String usernameFormElementName, String passwordFormElementName, String username, String password)
HashMap<String,String> reqProps = new HashMap<String,String>();
reqProps.put("Accept","text/html,application/xhtml+xml,application/xml");
reqProps.put("Accept-Encoding ","gzip, deflate, br");
reqProps.put("Accept-Language ","en-US,en;q=0.5");
reqProps.put("Connection","keep-alive");
reqProps.put("Content-Type","application/x-www-form-urlencoded");
reqProps.put("Host","red.webarts.bc.ca");
//reqProps.put("Referer","red.webarts.bc.ca");
reqProps.put("Upgrade-Insecure-Requests","1");
loginUrl
- requestProps
- formID
- usernameFormElementName
- passwordFormElementName
- username
- password
-
public String doScrape(String scrapePageUrl, String scrapeStart, String scrapeEnd)
public String doScrape(String scrapePageUrl, String scrapeStart, String scrapeEnd, HashMap<String,String> reqProps, boolean useCache)
public String doScrape(String scrapePageUrl, String scrapeStart, String scrapeEnd, HashMap<String,String> reqProps)
public String doScrape(String scrapePageUrl, HashMap<String,String> reqProps)
public String doScrape(String scrapePageUrl)
public String doScrape(HashMap<String,String> reqProps, boolean useCache)
public String doScrape(HashMap<String,String> reqProps)
public String doScrape(boolean useCache)
public static String prettyJson(String jsonStr)
public JsonObject toJsonObject(String jsonStr)
public JsonStructure toJsonStructure(String jsonStr)
protected void test(String[] args, String loginUrl, HashMap<String,String> requestProps, String formID, String usernameFormElementName, String passwordFormElementName, String username, String password)
protected int sendPost(String url, String postParams, HashMap<String,String> reqProps) throws MalformedURLException, IOException, ProtocolException
url
- is the url to post this stuff to
postParams
- is a sequential string of params that get sent in this post
MalformedURLException
IOException
ProtocolException
protected String getPageContent(String url) throws MalformedURLException, IOException, ProtocolException
protected String getPageContent(String url, HashMap<String,String> reqProps) throws MalformedURLException, IOException, ProtocolException
url
- is the url to get
reqProps
- optional (can be null) map of properties to add as requestProperties
MalformedURLException
IOException
ProtocolException
public String getLoginFormParams(String html, String formID, String usernameFormElementName, String passwordFormElementName, String username, String password) throws UnsupportedEncodingException
html
- is the string representation of the URL that has the form to parse
formID
- the form id (or name) to parse parameters from
usernameFormElementName
- the form elementName used for the username field
passwordFormElementName
- the form elementName used for the password field
username
- the actual login username to use in the form
password
- the actual login password to use in the form
UnsupportedEncodingException
public static String writeStringToFile(String s, String fileName)
s
- is the String to write out
fileName
- is the file name of the file to write the String into
public static String writeStringToFile(String s, String fileName, boolean zipCompress)
s
- is the String to write out
fileName
- is the file name of the file to write the String into
zipCompress
- boolean flag to compress with zip compression
public static String readFileToString(String fileName)
fileName
- is the file name to read into a String
WebARTS Library Licensed Under the GNU - General Public License. Other Libraries licensed under their respective Open Source Licenses