我正在尝试创建网络抓取器(又名网络抓取器)以从网站下载PDF文件。我想将所有PDF文件下载到C:\temp\
。我目前有指向Excel工作表A1:A17
上的子页面的链接。
使用以下代码将它们捕获到Excel工作表中:
Sub GetallLinks()
Dim internet As InternetExplorer
Dim internetdata As HTMLDocument
Dim internetlink As Object
Dim internetinnerlink As Object
Set internet = CreateObject("InternetExplorer.Application")
internet.Visible = False
internet.navigate ("https://www.nordicwater.com/products/waste-water/")
Do While internet.Busy
DoEvents
Loop
Do Until internet.readyState = READYSTATE_COMPLETE
DoEvents
Loop
Set internetdata = internet.document
Set internetlink = internetdata.getElementsByTagName("a")
i = 1
For Each internetinnerlink In internetlink
If Left$(internetinnerlink,36) = "https://www.nordicwater.com/product/" Then
activeSheet.Cells(i,1) = internetinnerlink.href
i = i + 1
Else
End If
Next internetinnerlink
End Sub
文件下载代码:
Sub DownloadFiles()
Dim xHttp: Set xHttp = CreateObject("microsoft.XMLHTTP")
Dim bStrm
Dim hDoc As MSHTML.HTMLDocument
Dim hAnchor As MSHTML.HTMLAnchorElement
Dim sPath As String
Dim i As Long
Dim wholeURL
Dim link
Dim range
range = ThisWorkbook.Worksheets("Sheet1").range("A1:A17")
wholeURL = "URL URL URL"
sPath = "C:\temp\"
For Each link In range
'Get the directory listing
xHttp.Open "GET",link
xHttp.send
'Wait for the page to load
Do Until xHttp.readyState = 4
DoEvents
Loop
'Put the page in an HTML document
Set hDoc = New MSHTML.HTMLDocument
hDoc.body.innerHTML = xHttp.responseText
'Loop through the hyperlinks on the directory listing
For i = 0 To hDoc.getElementsByTagName("a").Length - 1
Set hAnchor = hDoc.getElementsByTagName("a").Item(i)
'test the pathname to see if it matches your pattern
If hAnchor.pathname Like "*.pdf" Then
Debug.Print wholeURL & hAnchor.pathname
xHttp.Open "GET",wholeURL & hAnchor.pathname,False
xHttp.send
Set bStrm = CreateObject("Adodb.Stream")
With bStrm
.Type = 1 '//binary
.Open
.write xHttp.responseBody
.SaveToFile sPath & getName(wholeURL & hAnchor.pathname),2 '//overwrite
End With
Set bStrm = Nothing
End If
Next i
Next
End Sub
从网址获取文件名的功能:
Function getName(pf)
getName = Split(pf,"/")(UBound(Split(pf,"/")))
End Function
wholeURL =“ URL URL URL”:
A1:A17:
等
如何将这些代码连接在一起,这样就无需使用Excel Worksheet作为链接数据库并将链接存储在计算机内存中了?
编辑:
Sub DownloadFiles()
Dim xHttp As Object: Set xHttp = CreateObject("microsoft.XMLHTTP")
Dim hDoc As MSHTML.HTMLDocument
Dim Anchors As Object
Dim Anchor As Variant
Dim sPath As String
Dim wholeURL As String
Dim internet As InternetExplorer
Dim internetdata As HTMLDocument
Dim internetlink As Object
Dim internetinnerlink As Object
Dim arrLinks As Variant
Dim sLink As String
Dim iLinkCount As Integer
Dim iCounter As Integer
Dim sLinks As String
Set internet = CreateObject("InternetExplorer.Application")
internet.Visible = False
internet.navigate ("https://www.nordicwater.com/products/waste-water/")
Do While internet.Busy
DoEvents
Loop
Do Until internet.readyState = READYSTATE_COMPLETE
DoEvents
Loop
Set internetdata = internet.document
Set internetlink = internetdata.getElementsByTagName("a")
i = 1
For Each internetinnerlink In internetlink
If Left$(internetinnerlink,36) = "https://www.nordicwater.com/product/" Then
sLinks = sLinks & internetinnerlink.href & vbCrLf
i = i + 1
Else
End If
ThisWorkbook.Worksheets("Sheet1").range("B1").Value = sLinks
Next internetinnerlink
wholeURL = "https://www.nordicwater.com/"
sPath = "C:\temp\"
arrLinks = Split(p_sLinks,vbCrLf)
iLinkCount = UBound(arrLinks) + 1
For iCounter = 1 To iLinkCount
sLink = arrLinks(iCounter - 1)
'Get the directory listing
xHttp.Open "GET",sLink
xHttp.send
'Wait for the page to load
Do Until xHttp.readyState = 4
DoEvents
Loop
'Put the page in an HTML document
Set hDoc = New MSHTML.HTMLDocument
hDoc.body.innerHTML = xHttp.responseText
'Loop through the hyperlinks on the directory listing
Set Anchors = hDoc.getElementsByTagName("a")
For Each Anchor In Anchors
'test the pathname to see if it matches your pattern
If Anchor.pathname Like "*.pdf" Then
xHttp.Open "GET",wholeURL & Anchor.pathname,False
xHttp.send
With CreateObject("Adodb.Stream")
.Type = 1
.Open
.write xHttp.responseBody
.SaveToFile sPath & getName(wholeURL & Anchor.pathname),2 '//overwrite
End With
End If
Next
Next
End Sub
Function getName(pf As String) As String
getName = Split(pf,"/")))
End Function