John W. Colby
jwcolby at colbyconsulting.com
Fri Feb 18 10:14:58 CST 2005
Public Sub ParseHTML(sURL As String)
'# ---------------------------------------------------------------- #'
'# Procedure: ParseHTML #'
'# Written By: David Scott, Memphis Technology Group #'
'# Date Written: 08/20/1998 #'
'# Accepts: sURL - URL to examine #'
'# Returns: Nothing #'
'# References: MSHTML - Microsoft HTML Object Library #'
'# SHDocVw - Microsoft Internet Controls #'
'# Description: Opens an IE window, retrieves current document #'
'# and cycles through each element of the document #'
'# , printing to the debug window some of the ess- #'
'# ential properties of the element. For a complete#'
'# property listing check the object browser #'
'# ---------------------------------------------------------------- #'
On Error Resume Next '# SOME PROPERTIES ARE NOT
PRESENT
'# IN ALL ITEMS' PROPERTIES
Collection
'# AN ERROR WILL BE GENERATED
Dim SWs As New SHDocVw.ShellWindows '# INSTANCES OF EXPLORER
Dim IE As New SHDocVw.InternetExplorer '# OBJECT REFERENCE TO NEW
INSTANCE OF IE
Dim Doc As MSHTML.HTMLDocument '# IE DOCUMENT REFERENCE
Dim iItem As Integer '# CURRENT ITEM IN ITEM
Collection
'# OF DOCUMENT
Dim iItemCnt As Integer '# TOTAL NUMBER OF ITEMS IN ITEM
'# COLLECTION OF DOCUMENT
Dim itm '# IE ITEM OBJECT REFERENCE
Dim iFrm As Integer '# CURRENT FRAME IN FRAMES
Collection
'# OF DOCUMENT
Dim ifrmcnt As Integer '# TOTAL NUMBER OF FRAMES IN
Frame
'# COLLECTION OF DOCUMENT
Dim frm '# IE FRAME OBJECT REFERENCE
Dim strHTML As String
IE.Visible = True '# SHOW INSTANCE OF IE
IE.Offline = False '# MAKE SURE BROWSER ISN'T
WORKING OFFLINE
IE.navigate sURL '# NAVIGATE TO THAT AWESOME
Access SITE
If SWs.Count = 0 Then Exit Sub '# IF NO INSTANCE OF IE IS OPEN,
THEN
'# THERE'S NO NEED TO CONTINUE
Do Until IE.Busy = False '# WAIT UNTIL BROWSER IS
FINISHED DOWN-
Loop '# LOADING PAGE
Do Until IE.Document.readyState = "complete"
Loop
Set Doc = IE.Document '# SET CURRENT DOCUMENT
'# IF THE DOCUMENT LOADED IS A TRUE HTML DOCUMENT (TYPE IS DEFINED
'# BY MSHTML LIBRARY) THEN CONTINUE, IF NOT, THEN DO NOTHING
If TypeOf Doc Is HTMLDocument Then
'# PRINT DOCUMENT TITLE
ifrmcnt = Doc.Frames.Length '# RETREIVE NUMBER OF FRAMES IN
Document
If ifrmcnt = 0 Then '# IF NO FRAMES, THEN PRINT
ITEMS OF CURRENT DOCUMENT
GoSub IterateItem
Else
For iFrm = 0 To ifrmcnt - 1 '# CYCLE THROUGH EACH FRAME
LISTED IN DOCUMENT
GoSub IterateItem
Next
End If
End If
IE.Quit '# QUIT IE
'# CLEANUP
If Not itm Is Nothing Then Set itm = Nothing
If Not frm Is Nothing Then Set frm = Nothing
If Not Doc Is Nothing Then Set Doc = Nothing
If Not IE Is Nothing Then Set IE = Nothing
Exit Sub
IterateItem:
If ifrmcnt = 0 Then '# IF NO FRAMES, THEN PRINT ITEMS OF
CURRENT DOCUMENT
iItemCnt = Doc.all.Length
Else
iItemCnt = frm.Document.all.Length
Set frm = Doc.Frames(iFrm)
End If
For iItem = 0 To iItemCnt - 1
If ifrmcnt = 0 Then '# IF NO FRAMES, THEN PRINT ITEMS
OF CURRENT DOCUMENT
Set itm = Doc.all.Item(iItem)
Else
Set itm = frm.Document.all.Item(iItem)
End If
Select Case itm.tagName
Case "A", "HTML", "HEAD", "TITLE"
Case "BODY"
strHTML = strHTML & "InnerHTML " & iItem & ": " & itm.innerHTML
If InStr(strHTML, "ad.exe?") > 0 Then
Dim str As String
str = Right$(strHTML, Len(strHTML) - InStr(strHTML,
"/cgi-cls/ad.exe?"))
str = Left$(str, InStr(str, "<table"))
If Len(str) > 0 Then GrabJobURLs str
Else
strHTML = ""
End If
Case Else
End Select
' strHTML = "Tagname " & iItem & ": " & itm.tagName
' strHTML = strHTML & "Text " & iItem & ": " & itm.Text
' strHTML = strHTML & "InnerHTML " & iItem & ": " & itm.innerHTML
' strHTML = strHTML & "InnerText " & iItem & ": " & itm.innerText
' strHTML = strHTML & "HRef " & iItem & ": " & itm.href
' If InStr(strHTML, "ad.exe?") > 0 Then
' Dim str As String
' str = Right$(strHTML, Len(strHTML) - InStr(strHTML,
"/cgi-cls/ad.exe?"))
' str = Left$(str, InStr(str, "<table"))
' If Len(str) > 0 Then GrabJobURLs str
' Else
' strHTML = ""
' End If
' If InStr(strHTML, "Next Page") > 0 Then
' MsgBox "section found"
' End If
NextItem:
Next
Return
End Sub
John W. Colby
www.ColbyConsulting.com
Contribute your unused CPU cycles to a good cause:
http://folding.stanford.edu/