net.web
Class UrlUtils

java.lang.Object
  extended by net.web.UrlUtils

public class UrlUtils
extends java.lang.Object


Field Summary
static java.lang.String ACCEPT
           
static java.lang.String AGENT_NAME
           
static java.lang.String CONTENT_LENGTH
           
static java.lang.String CONTENT_TYPE
           
static java.lang.String DATE_HEADER
           
static java.lang.String FILE_URL_PREFIX
           
static java.lang.String HEADER_PRAGMA
           
static java.lang.String HTTP_URL_PREFIX
           
static java.lang.String METHOD_DELETE
           
static java.lang.String METHOD_GET
           
static java.lang.String METHOD_HEAD
           
static java.lang.String METHOD_OPTIONS
           
static java.lang.String METHOD_POST
           
static java.lang.String METHOD_PUT
           
static java.lang.String METHOD_TRACE
           
static java.lang.String SOAPACTION
           
static java.lang.String SUPPORTED
           
static java.lang.String USER_AGENT
           
 
Constructor Summary
UrlUtils()
           
 
Method Summary
static java.lang.String conditionUrl(java.lang.String s)
          Replace the spaces with %20
static void dataMineCTC()
           
static java.lang.String encode(java.lang.String s)
           
static java.lang.String encodePath(java.lang.String path)
           
static java.net.URL fileToURL(java.io.File file)
          Returns the directory or JAR file URL corresponding to the specified local file name.
static byte[] getBytes(java.net.URL url)
          Read in the entire url at once, into an array of bytes, and return it.
static java.lang.String getContentType(java.lang.String urlString)
          Opens a connection to the given URL, gets the content type, then closes the connection.
static java.lang.String[] getHrefs(java.lang.String s, java.lang.String containingString)
          Search the string, s, for all the hrefs that contain the given string.
static java.lang.String[] getHrefs(java.lang.String s, java.lang.String searchString, java.lang.String rootUrl)
          Search the string, s, for all the hrefs that are uppercase HREF=
static java.lang.String getOneBigUrlString(java.net.URL url)
          Read in all the text at once, from a given url, and return it in one big string
static java.lang.String getTableToCSVText(java.net.URL url)
          Strip out the html tags and just get the text from the URL. Makes CSV data.
static java.lang.String[] getTxtGz(java.net.URL url)
          Given a URL of the form fn.txt.gz, this returns an array of strings, one per line.
static java.util.Vector getUrl(java.lang.String _urlString)
           
static java.util.Vector getUrl(java.net.URL url)
           
static void getUrl(java.net.URL url, java.io.File f)
          Read a url and put it into a file.
static void getUrlBinary(java.io.File f, java.net.URL url)
           
static void getUrlBinary(java.net.URL url)
           
static java.util.Date getUrlModificationDate(java.net.URL url)
           
static java.lang.String[] getUrlString(java.lang.String[] urlStrings)
          Get the text from each url listed in the urlStrings array and create one big array.
static java.lang.String[] getUrlStrings(java.net.URL urlString)
          Call this routine if you want one HTML line per array element.
static java.lang.String[] getUrlStrings(java.net.URL url, java.lang.String uid, java.lang.String pw)
          Use basic authentication in order to retrieve the url as an array of strings.
static java.util.Vector getUrlVector(java.net.URL url)
           
static java.lang.String getUTF8Encoding()
           
static java.lang.String html2text(java.lang.String s)
           
static java.lang.String html2text(java.net.URL url)
          Input: the HTML at the URL.
static boolean isFileUrl(java.lang.String url)
           
static boolean isHttpUrl(java.lang.String url)
           
static java.net.URL[] isolateLinks(java.lang.String suffix, java.net.URL[] urls)
           
static boolean isURLAvailable(java.lang.String urlString, int timeOut)
          Utility method to detect whether an URL is accessible within a timeout.
static java.util.Vector lookForJobs()
           
static void main(java.lang.String[] args)
           
static java.net.URL[] pathToURLs(java.lang.String path)
          Utility method for converting a search path string to an array of directory and JAR file URLs.
static void printLabels()
           
static void processGz(java.net.URL url, LineProcessor lp)
           
static java.lang.String receiveHttpMessage(java.net.Socket socket)
           
static void sendHttpMessage(java.net.Socket socket, java.lang.String command)
           
static void testEdgarExtract()
           
static void textStreamProcessor(java.net.URL u, LineProcessor lineProcessor)
           
static java.lang.String toString(java.lang.String url)
           
static java.lang.String urlEncode(java.lang.String url)
          This will encode a URL's odd characters, but I don't know what the default encoding scheme should be, so I set it to null.
 
Methods inherited from class java.lang.Object
equals, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

HTTP_URL_PREFIX

public static final java.lang.String HTTP_URL_PREFIX
See Also:
Constant Field Values

FILE_URL_PREFIX

public static final java.lang.String FILE_URL_PREFIX
See Also:
Constant Field Values

AGENT_NAME

public static final java.lang.String AGENT_NAME
See Also:
Constant Field Values

CONTENT_LENGTH

public static final java.lang.String CONTENT_LENGTH
See Also:
Constant Field Values

CONTENT_TYPE

public static final java.lang.String CONTENT_TYPE
See Also:
Constant Field Values

DATE_HEADER

public static final java.lang.String DATE_HEADER
See Also:
Constant Field Values

SUPPORTED

public static final java.lang.String SUPPORTED
See Also:
Constant Field Values

ACCEPT

public static final java.lang.String ACCEPT
See Also:
Constant Field Values

SOAPACTION

public static final java.lang.String SOAPACTION
See Also:
Constant Field Values

USER_AGENT

public static final java.lang.String USER_AGENT
See Also:
Constant Field Values

METHOD_DELETE

public static final java.lang.String METHOD_DELETE
See Also:
Constant Field Values

METHOD_HEAD

public static final java.lang.String METHOD_HEAD
See Also:
Constant Field Values

METHOD_GET

public static final java.lang.String METHOD_GET
See Also:
Constant Field Values

METHOD_OPTIONS

public static final java.lang.String METHOD_OPTIONS
See Also:
Constant Field Values

METHOD_POST

public static final java.lang.String METHOD_POST
See Also:
Constant Field Values

METHOD_PUT

public static final java.lang.String METHOD_PUT
See Also:
Constant Field Values

METHOD_TRACE

public static final java.lang.String METHOD_TRACE
See Also:
Constant Field Values

HEADER_PRAGMA

public static final java.lang.String HEADER_PRAGMA
See Also:
Constant Field Values
Constructor Detail

UrlUtils

public UrlUtils()
Method Detail

sendHttpMessage

public static void sendHttpMessage(java.net.Socket socket,
                                   java.lang.String command)
                            throws java.io.IOException
Throws:
java.io.IOException

encodePath

public static java.lang.String encodePath(java.lang.String path)

getUTF8Encoding

public static java.lang.String getUTF8Encoding()

receiveHttpMessage

public static java.lang.String receiveHttpMessage(java.net.Socket socket)
                                           throws java.io.IOException
Throws:
java.io.IOException

printLabels

public static void printLabels()
                        throws java.io.IOException,
                               javax.swing.text.BadLocationException
Throws:
java.io.IOException
javax.swing.text.BadLocationException

getContentType

public static java.lang.String getContentType(java.lang.String urlString)
                                       throws java.io.IOException
Opens a connection to the given URL, gets the content type, then closes the connection.

Throws:
java.io.IOException

lookForJobs

public static java.util.Vector lookForJobs()
                                    throws java.io.IOException
Throws:
java.io.IOException

getTableToCSVText

public static java.lang.String getTableToCSVText(java.net.URL url)
                                          throws java.io.IOException,
                                                 javax.swing.text.BadLocationException
Strip out the html tags and just get the text from the URL. Makes CSV data.

Parameters:
url - source of data
Returns:
tag-free text
Throws:
java.io.IOException
javax.swing.text.BadLocationException

html2text

public static java.lang.String html2text(java.lang.String s)
                                  throws java.io.IOException,
                                         javax.swing.text.BadLocationException
Throws:
java.io.IOException
javax.swing.text.BadLocationException

html2text

public static java.lang.String html2text(java.net.URL url)
                                  throws java.io.IOException,
                                         javax.swing.text.BadLocationException
Input: the HTML at the URL.

Parameters:
url - the source of the html
Returns:
one big plain text string
Throws:
java.io.IOException
javax.swing.text.BadLocationException

getHrefs

public static java.lang.String[] getHrefs(java.lang.String s,
                                          java.lang.String containingString)
Search the string, s, for all the hrefs that contain the given string (containingString).

Parameters:
s - raw html used for the search
containingString - stuff we are looking for in the href
Returns:
a list of all the hrefs with the containingString.

getHrefs

public static java.lang.String[] getHrefs(java.lang.String s,
                                          java.lang.String searchString,
                                          java.lang.String rootUrl)
Search the string, s, for all the hrefs that are uppercase HREF=

Parameters:
s - raw html used for the search
Returns:
a list of all the uppercase hrefs.

getOneBigUrlString

public static java.lang.String getOneBigUrlString(java.net.URL url)
                                           throws java.io.IOException
Read in all the text at once, from a given url, and return it in one big string

Parameters:
url - to read from
Returns:
string data for mining
Throws:
java.io.IOException

getUrl

public static void getUrl(java.net.URL url,
                          java.io.File f)
                   throws java.io.IOException
Read a url and put it into a file. This is very good when dealing with large files.

Parameters:
url - input file (like data.jar)
f - locally created output file.
Throws:
java.io.IOException

getBytes

public static byte[] getBytes(java.net.URL url)
Read in the entire url at once, into an array of bytes, and return it.

Parameters:
url - an input url that points to binary data
Returns:
url bytes

getUrlStrings

public static java.lang.String[] getUrlStrings(java.net.URL urlString)
                                        throws java.io.IOException
Call this routine if you want one HTML line per array element. Example:

String s[] = UrlUtils.getUrlStrings(new URL("http://www.docjava.com"));

Parameters:
urlString - input URL
Returns:
array showing contents of web page.
Throws:
java.io.IOException

getUrlString

public static java.lang.String[] getUrlString(java.lang.String[] urlStrings)
                                       throws java.io.IOException
Get the text from each url listed in the urlStrings array and create one big array.

Parameters:
urlStrings -
Returns:
Throws:
java.io.IOException

conditionUrl

public static java.lang.String conditionUrl(java.lang.String s)
Replace the spaces with %20

Parameters:
s -
Returns:

isolateLinks

public static java.net.URL[] isolateLinks(java.lang.String suffix,
                                          java.net.URL[] urls)
Parameters:
suffix - the string representation of the urls must end with the suffix
urls - a list of urls to sort through
Returns:
an array of urls that end with the suffix

getUrlBinary

public static void getUrlBinary(java.net.URL url)
                         throws java.io.IOException
Throws:
java.io.IOException

getUrlBinary

public static void getUrlBinary(java.io.File f,
                                java.net.URL url)
                         throws java.io.IOException
Throws:
java.io.IOException

getUrlVector

public static java.util.Vector getUrlVector(java.net.URL url)
                                     throws java.io.IOException
Throws:
java.io.IOException

getTxtGz

public static java.lang.String[] getTxtGz(java.net.URL url)
                                   throws java.io.IOException
Given a URL of the form fn.txt.gz, this returns an array of strings, one per line. This code can use a LOT of memory. Careful!

Parameters:
url -
Returns:
Throws:
java.io.IOException

processGz

public static void processGz(java.net.URL url,
                             LineProcessor lp)
                      throws java.io.IOException
Throws:
java.io.IOException

urlEncode

public static java.lang.String urlEncode(java.lang.String url)
                                  throws java.io.UnsupportedEncodingException
This will encode a URL's odd characters, but I don't know what the default encoding scheme should be, so I set it to null.

Parameters:
url -
Returns:
Throws:
java.io.UnsupportedEncodingException

toString

public static java.lang.String toString(java.lang.String url)

getUrl

public static java.util.Vector getUrl(java.lang.String _urlString)

getUrl

public static java.util.Vector getUrl(java.net.URL url)
                               throws java.io.IOException
Throws:
java.io.IOException

getUrlModificationDate

public static java.util.Date getUrlModificationDate(java.net.URL url)
                                             throws java.io.IOException
Throws:
java.io.IOException

pathToURLs

public static java.net.URL[] pathToURLs(java.lang.String path)
Utility method for converting a search path string to an array of directory and JAR file URLs.

Parameters:
path - the search path string
Returns:
the resulting array of directory and JAR file URLs

fileToURL

public static java.net.URL fileToURL(java.io.File file)
Returns the directory or JAR file URL corresponding to the specified local file name.

Parameters:
file - the File object
Returns:
the resulting directory or JAR file URL, or null if unknown

isURLAvailable

public static boolean isURLAvailable(java.lang.String urlString,
                                     int timeOut)
Utility method to detect whether an URL is accessible within a timeout.

Parameters:
urlString - the URL
timeOut - timeout in milliseconds
Returns:
true if available in the given time

isFileUrl

public static boolean isFileUrl(java.lang.String url)

isHttpUrl

public static boolean isHttpUrl(java.lang.String url)

getUrlStrings

public static java.lang.String[] getUrlStrings(java.net.URL url,
                                               java.lang.String uid,
                                               java.lang.String pw)
                                        throws java.io.IOException
Use basic authentication in order to retrieve the url as an array of strings.

Parameters:
url - the url (perhaps with cgi arguments)
uid - a required uid for access
pw - a base 64 encoded, insecure password to be passed (careful!)
Returns:
The HTML content, one line at a time.
Throws:
java.io.IOException - if the url cannot be opened

testEdgarExtract

public static void testEdgarExtract()
                             throws java.io.IOException
Throws:
java.io.IOException

textStreamProcessor

public static void textStreamProcessor(java.net.URL u,
                                       LineProcessor lineProcessor)
                                throws java.io.IOException
Throws:
java.io.IOException

encode

public static java.lang.String encode(java.lang.String s)

main

public static void main(java.lang.String[] args)
                 throws java.io.IOException
Throws:
java.io.IOException

dataMineCTC

public static void dataMineCTC()
                        throws java.io.IOException
Throws:
java.io.IOException