我用浏览器抓取图片的爬虫代码在我的远程服务器上运行良好，但在 EC2 实例上却无法抓取，尽管我已经检查过所有依赖项。下面是这段代码。
该 EC2 实例运行的是 Ubuntu。除这个脚本外，其他所有文件在该实例上都运行正常。我猜问题在于它无法打开 URL 进行抓取。Chromium 浏览器以及运行代码所需的所有模块都已安装。请帮我看看并协助调试。
const fs = require('fs');
const puppeteer = require('puppeteer');
const {Pool,Client} = require('pg')
// Connection string for the articles database. DATABASE_URL, when set, takes
// precedence so the credentials do not have to live in this file.
// NOTE(review): the fallback embeds a user name and password in source; rotate
// them and drop the fallback once the environment variable is deployed.
const connectionString = process.env.DATABASE_URL ||
  "postgres://augli1234:augli1234@kamal1234.c5kamoli1el6.ap-south-1.rds.amazonaws.com:5432/augli";

// Single shared client; the query further down in this file runs on it.
const client = new Client({
  connectionString: connectionString,
});

// Report a failed connection explicitly instead of ignoring the outcome.
client.connect(function (err) {
  if (err) {
    console.error('Database connection failed:', err.message);
    process.exitCode = 1;
  }
});

// Number of result rows visited so far; incremented by the crawl loop.
let count = 0;

// Today's date (local time) as zero-padded YYYY-MM-DD, used as the upper bound
// of the date range in the article query. Padding keeps the value unambiguous
// whether it is parsed as a date or compared as text.
const today = new Date();
const date = [
  String(today.getFullYear()),
  String(today.getMonth() + 1).padStart(2, '0'), // getMonth() is 0-based
  String(today.getDate()).padStart(2, '0'),
].join('-');
console.log(date);
//select distinct article_id from public.content_paraarticle
// Upper bound on result rows handled in one run; the loop stops once the
// running row counter exceeds it.
const MAX_ARTICLES_PER_RUN = 17;
// Navigation budget handed to page.goto, in milliseconds.
const NAVIGATION_TIMEOUT_MS = 30000;
// File that collects the names of screenshots that could not be produced.
const FAILED_LOG_PATH = "test.txt";

// Per-site configuration: how to recognise the site from a URL and which
// element holds the lead image.
// NOTE(review): the livemint branch of the original script was truncated
// mid-statement; it is assumed to follow the same navigate -> locate ->
// screenshot flow as the thehindu branch, differing only in the selector.
const SITE_RULES = [
  {
    label: 'Hindu',
    pattern: /thehindu\.com/i,
    selector: 'body > div.container-main > div.jscroll > div > div > div > section > div > div > div > div:nth-child(2) > div.lead-img-cont > div > picture > img',
  },
  {
    label: 'livemint',
    pattern: /livemint/i,
    selector: 'figure > img',
  },
];

/**
 * Finds the site rule whose pattern matches the given URL.
 *
 * @param {*} url - URL value taken from a result row; non-strings never match.
 * @returns {Object|undefined} The matching entry of SITE_RULES, if any.
 */
function findSiteRule(url) {
  if (typeof url !== 'string') {
    return undefined;
  }
  return SITE_RULES.find(function (rule) {
    return rule.pattern.test(url);
  });
}

/**
 * Opens `url` in a fresh browser, screenshots the element matching `selector`
 * and writes it to `outputPath`. The browser is always closed, on success and
 * on failure alike.
 *
 * NOTE(review): if puppeteer.launch() rejects on the EC2 host, the message
 * logged by the caller should say why; a Chromium sandbox/start-up problem on
 * Linux is a common cause - confirm against the Puppeteer troubleshooting
 * guide before changing launch options.
 *
 * @param {string} url - Article page to open.
 * @param {string} selector - CSS selector of the image element.
 * @param {string} outputPath - Destination file for the PNG screenshot.
 * @returns {Promise<void>}
 * @throws {Error} When launch, navigation or the screenshot fails, or when no
 *   element matches `selector`.
 */
async function captureLeadImage(url, selector, outputPath) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1280, height: 926 });

    console.log('..........taken');
    // The timeout option bounds the navigation directly; racing an already
    // awaited goto() against a timer never limited anything.
    const result = await page.goto(url, {
      waitUntil: 'networkidle2',
      timeout: NAVIGATION_TIMEOUT_MS,
    });
    console.log('.........................');
    if (result) {
      console.log(result.status());
    }

    const image = await page.$(selector);
    if (!image) {
      throw new Error('No element matches selector "' + selector + '" on ' + url);
    }
    console.log('screenshot started to get taken');
    await image.screenshot({ path: outputPath, omitBackground: true });
    console.log('screenshot taken');
  } finally {
    await browser.close();
  }
}

/**
 * Appends the name of a screenshot that could not be taken to FAILED_LOG_PATH.
 *
 * @param {string} fileName - Screenshot file name that failed.
 */
function recordFailure(fileName) {
  fs.appendFile(FAILED_LOG_PATH, "\n" + fileName + "\n", function (err) {
    if (err) {
      return console.log(err);
    }
    console.log("The file was saved!");
  });
}

/**
 * Walks the result rows in order and captures a lead-image screenshot named
 * `<article_id>.png` for every row whose URL belongs to a known site. A failed
 * capture is logged and recorded, then the loop moves on to the next row.
 *
 * @param {Array<{url: string, article_id: *}>} rows - Rows from the query.
 * @returns {Promise<void>}
 */
async function processPendingArticles(rows) {
  for (const row of rows) {
    count++;
    console.log(count);
    if (count > MAX_ARTICLES_PER_RUN) {
      // Leave the loop normally so the database client still gets closed.
      break;
    }

    const fileName = row.article_id + '.png';
    console.log(fileName);

    const url = row.url;
    const rule = findSiteRule(url);
    if (!rule) {
      console.log("Doesn't belongs to thehindu or livemint");
      continue;
    }

    console.log('Belongs to ' + rule.label);
    console.log(url);
    try {
      await captureLeadImage(url, rule.selector, fileName);
    } catch (err) {
      // Print the real failure reason rather than a fixed placeholder text.
      console.log(err.message);
      recordFailure(fileName);
    }
  }
}

// Fetch every paragraph URL whose main article still has an empty image_url
// and a date between 2019-10-01 and today, then screenshot the lead images.
// The upper date bound is passed as a bind parameter instead of being spliced
// into the SQL text.
client.query(
  "SELECT DISTINCT url,article_id FROM public.content_paraarticle where article_id IN(Select id from public.content_mainarticle where image_url='' AND date BETWEEN '2019-10-01' AND $1)",
  [date],
  async (err, res) => {
    try {
      if (err) {
        throw err;
      }
      await processPendingArticles(res.rows);
    } catch (failure) {
      console.error('Image crawl failed:', failure);
      process.exitCode = 1;
    } finally {
      // Always release the connection so the process can exit.
      await client.end();
    }
  }
);
/**
 * Process-wide listener for promise rejections that no handler picked up.
 * Logs the promise and the rejection reason; application-specific logging,
 * rethrowing or other recovery logic would be added here.
 *
 * @param {*} reason - Value the promise was rejected with.
 * @param {Promise} p - The promise that was rejected.
 */
function reportUnhandledRejection(reason, p) {
  console.log('Unhandled Rejection at: Promise', p, 'reason:', reason);
}

process.on('unhandledRejection', reportUnhandledRejection);