将链接存储在计算机内存中，而不是Excel工作表中

2024-04-29 • 问答

我正在尝试创建网络抓取器（又名网络抓取器）以从网站下载PDF文件。我想将所有PDF文件下载到C:\temp\。我目前有指向Excel工作表A1:A17上的子页面的链接。

使用以下代码将它们捕获到Excel工作表中：

Sub GetallLinks()

Dim internet As InternetExplorer
Dim internetdata As HTMLDocument
Dim internetlink As Object
Dim internetinnerlink As Object

Set internet = CreateObject("InternetExplorer.Application")

internet.Visible = False

internet.navigate ("https://www.nordicwater.com/products/waste-water/")

    Do While internet.Busy
      DoEvents
    Loop
    Do Until internet.readyState = READYSTATE_COMPLETE

    DoEvents

    Loop

        Set internetdata = internet.document

        Set internetlink = internetdata.getElementsByTagName("a")

        i = 1

        For Each internetinnerlink In internetlink


            If Left$(internetinnerlink,36) = "https://www.nordicwater.com/product/" Then

            activeSheet.Cells(i,1) = internetinnerlink.href

            i = i + 1

            Else
            End If


Next internetinnerlink

End Sub

文件下载代码：

Sub DownloadFiles()

    Dim xHttp: Set xHttp = CreateObject("microsoft.XMLHTTP")
    Dim bStrm
    Dim hDoc As MSHTML.HTMLDocument
    Dim hAnchor As MSHTML.HTMLAnchorElement
    Dim sPath As String
    Dim i As Long
    Dim wholeURL
    Dim link
    Dim range

    range = ThisWorkbook.Worksheets("Sheet1").range("A1:A17")

    wholeURL = "URL URL URL"

    sPath = "C:\temp\"

    For Each link In range

    'Get the directory listing
    xHttp.Open "GET",link
    xHttp.send

    'Wait for the page to load
    Do Until xHttp.readyState = 4
        DoEvents
    Loop

    'Put the page in an HTML document
    Set hDoc = New MSHTML.HTMLDocument
    hDoc.body.innerHTML = xHttp.responseText

    'Loop through the hyperlinks on the directory listing
    For i = 0 To hDoc.getElementsByTagName("a").Length - 1
        Set hAnchor = hDoc.getElementsByTagName("a").Item(i)

        'test the pathname to see if it matches your pattern
        If hAnchor.pathname Like "*.pdf" Then

                Debug.Print wholeURL & hAnchor.pathname

                xHttp.Open "GET",wholeURL & hAnchor.pathname,False
                xHttp.send

                Set bStrm = CreateObject("Adodb.Stream")

                With bStrm
                    .Type = 1 '//binary
                    .Open
                    .write xHttp.responseBody
                    .SaveToFile sPath & getName(wholeURL & hAnchor.pathname),2 '//overwrite
                End With

                Set bStrm = Nothing

        End If

    Next i

    Next

End Sub

从网址获取文件名的功能：

Function getName(pf)
getName = Split(pf,"/")(UBound(Split(pf,"/")))
End Function

wholeURL =“ URL URL URL”：

A1：A17：

等

如何将这些代码连接在一起，这样就无需使用Excel Worksheet作为链接数据库并将链接存储在计算机内存中了？

编辑：

Sub DownloadFiles()
    Dim xHttp       As Object: Set xHttp = CreateObject("microsoft.XMLHTTP")
    Dim hDoc        As MSHTML.HTMLDocument
    Dim Anchors     As Object
    Dim Anchor      As Variant
    Dim sPath       As String
    Dim wholeURL    As String

    Dim internet As InternetExplorer
    Dim internetdata As HTMLDocument
    Dim internetlink As Object
    Dim internetinnerlink As Object
    Dim arrLinks As Variant
    Dim sLink As String
    Dim iLinkCount As Integer
    Dim iCounter As Integer
    Dim sLinks As String

    Set internet = CreateObject("InternetExplorer.Application")
    internet.Visible = False
    internet.navigate ("https://www.nordicwater.com/products/waste-water/")

        Do While internet.Busy
          DoEvents
        Loop
        Do Until internet.readyState = READYSTATE_COMPLETE
            DoEvents
        Loop

        Set internetdata = internet.document
        Set internetlink = internetdata.getElementsByTagName("a")

        i = 1

        For Each internetinnerlink In internetlink
            If Left$(internetinnerlink,36) = "https://www.nordicwater.com/product/" Then

                sLinks = sLinks & internetinnerlink.href & vbCrLf
                i = i + 1

            Else
            End If

    ThisWorkbook.Worksheets("Sheet1").range("B1").Value = sLinks

    Next internetinnerlink

    wholeURL = "https://www.nordicwater.com/"
    sPath = "C:\temp\"

    arrLinks = Split(p_sLinks,vbCrLf)
    iLinkCount = UBound(arrLinks) + 1

    For iCounter = 1 To iLinkCount
    sLink = arrLinks(iCounter - 1)
        'Get the directory listing
        xHttp.Open "GET",sLink
        xHttp.send

        'Wait for the page to load
        Do Until xHttp.readyState = 4
            DoEvents
        Loop

        'Put the page in an HTML document
        Set hDoc = New MSHTML.HTMLDocument
        hDoc.body.innerHTML = xHttp.responseText

        'Loop through the hyperlinks on the directory listing
        Set Anchors = hDoc.getElementsByTagName("a")

        For Each Anchor In Anchors

            'test the pathname to see if it matches your pattern
            If Anchor.pathname Like "*.pdf" Then

                xHttp.Open "GET",wholeURL & Anchor.pathname,False
                xHttp.send

                With CreateObject("Adodb.Stream")
                    .Type = 1
                    .Open
                    .write xHttp.responseBody
                    .SaveToFile sPath & getName(wholeURL & Anchor.pathname),2 '//overwrite
                End With

            End If

        Next

    Next

End Sub
Function getName(pf As String) As String
    getName = Split(pf,"/")))
End Function

eer05067 回答：将链接存储在计算机内存中，而不是Excel工作表中

代码中有几个错误，下面已更正。您需要创建一个新的ADODB.Stream对象，或者确保关闭上一个对象。另外，您应该尽可能强地键入变量。我清理了几个地方。

Function getName(pf As String) As String
    getName = Split(pf,"/")(UBound(Split(pf,"/")))
End Function


Sub DownloadFiles()
    Dim xHttp       As Object: Set xHttp = CreateObject("Microsoft.XMLHTTP")
    Dim hDoc        As MSHTML.HTMLDocument
    Dim Anchors     As Object
    Dim Anchor      As Variant
    Dim sPath       As String
    Dim wholeURL    As String
    Dim link        As range
    Dim range       As range

    Set range = ThisWorkbook.Worksheets("Sheet1").range("A1:A5")
    wholeURL = "https://www.nordicwater.com/"
    sPath = "C:\temp\"

    For Each link In range
        'Get the directory listing
        xHttp.Open "GET",link
        xHttp.send

        'Wait for the page to load
        Do Until xHttp.readyState = 4
            DoEvents
        Loop

        'Put the page in an HTML document
        Set hDoc = New MSHTML.HTMLDocument
        hDoc.body.innerHTML = xHttp.responseText

        'Loop through the hyperlinks on the directory listing
        Set Anchors = hDoc.getElementsByTagName("a")

        For Each Anchor In Anchors

            'test the pathname to see if it matches your pattern
            If Anchor.pathname Like "*.pdf" Then

                xHttp.Open "GET",wholeURL & Anchor.pathname,False
                xHttp.send

                With CreateObject("Adodb.Stream")
                    .Type = 1
                    .Open
                    .Write xHttp.responseBody
                    .SaveToFile sPath & getName(wholeURL & Anchor.pathname),2 '//overwrite
                End With

            End If

        Next

    Next

End Sub

您可以将所有链接存储在字符串中，并用vbCrLf分隔每个链接，然后使用Split(yourstring,vbCrLf)获取链接数组。这样，您就不需要在Excel中运行此程序，或者至少您不需要使用Excel单元格。

为此，创建一个字符串变量，例如sLinks。然后，在您的第一个循环中，替换

ActiveSheet.Cells(i,1) = internetinnerlink.href

使用

sLinks = sLinks & internetinnerlink.href & vbCrLf

完成此操作后，您无需再将链接存储在Excel中。然后，您可以将此字符串作为参数传递给DownloadFiles子：

Sub DownloadFiles(p_sLinks)
    Dim arrLinks As Variant
    Dim sLink As String
    Dim iLinkCount As Integer
    Dim iCounter As Integer

    arrLinks = Split(p_sLinks,vbCrLf)
    iLinkCount = UBound(arrLinks) + 1

    For iCounter = 1 to iLinkCount
        sLink = arrLinks(iCounter - 1)
        ' Process sLink here
    Next

End Sub

您可以将此代码与现有的DownloadFiles子目录合并，将For Each link In range循环替换为For iCounter = 1 to iLinkCount，将循环中的代码放入新循环中，并使用sLink作为链接处理而不是从Excel中读取。

您可以将一些代码分成Subs，以使其更易于阅读和排除故障：

Sub DownloadFile(p_sURL,p_sLocalPath)
    Dim xHttp As Object: Set xHttp = CreateObject("Microsoft.XMLHTTP")

    xHttp.Open "GET",p_sURL,False
    xHttp.send

    With CreateObject("Adodb.Stream")
        .Type = 1
        .Open
        .write xHttp.responseBody
        .SaveToFile p_sLocalPath & getName(p_sURL),2 ' //overwrite
    End With

End Sub

excel vba web-crawler web-scraping

本文链接：https://www.f2er.com/3129195.html

将链接存储在计算机内存中，而不是Excel工作表中

eer05067 回答：将链接存储在计算机内存中，而不是Excel工作表中

大家都在问