javascript CasperJS 循环或迭代多个网页?
声明:本页面是StackOverFlow热门问题的中英对照翻译,遵循CC BY-SA 4.0协议,如果您需要使用它,必须同样遵循CC BY-SA许可,注明原文地址和作者信息,同时你必须将它归于原作者(不是我):StackOverFlow
原文地址: http://stackoverflow.com/questions/23384963/
Warning: this content is provided under the CC BY-SA 4.0 license. You are free to use and share it, but you must attribute it to the original authors (not me):
StackOverFlow
CasperJS loop or iterate through multiple web pages?
提问by karansolo
I have a CasperJS script that scrapes ratings and dates from one webpage. Now I want to scrape the same data from multiple pages under the same website. How can I loop through the different subpages given this code:
我有一个 CasperJS 脚本,可以从一个网页中抓取评级和日期。现在我想从同一个网站下的多个页面中抓取相同的数据。给定此代码,我如何遍历不同的子页面:
// Shared accumulators: filled by the scraping step, read again in the final run() callback.
var ratings = [],
    dates = [];

// File-system module, used further down to write the scraped lines to disk.
var fs = require('fs');

// Casper instance. The page settings switch off image and plugin loading;
// logging is set to debug level and printed (verbose).
var casper = require('casper').create({
    verbose: true,
    logLevel: "debug",
    pageSettings: {
        loadPlugins: false,
        loadImages: false
    }
});
/**
 * Collects the `title` attribute of every rating image matched on the page.
 * Reads the global `document`, so it is meant to be run in page context
 * (it is passed to `evaluate` elsewhere in this script).
 *
 * @returns {Array} one `title` attribute value per matched <img>, in document order
 */
function getRatings() {
    var selector = '#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img';
    var images = document.querySelectorAll(selector);
    var titles = [];
    for (var idx = 0; idx < images.length; idx++) {
        titles.push(images[idx].getAttribute('title'));
    }
    return titles;
}
/**
 * Collects the inner HTML of every review-date element matched on the page.
 * Reads the global `document`, so it is meant to be run in page context
 * (it is passed to `evaluate` elsewhere in this script).
 *
 * @returns {Array} one `innerHTML` value per matched element, in document order
 */
function getDate() {
    var selector = '#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate';
    var found = [];
    Array.prototype.forEach.call(document.querySelectorAll(selector), function (node) {
        found.push(node.innerHTML);
    });
    return found;
}
// Open the reviews page; the callback only prints a greeting.
casper.start('http://www.t-mobile.com/cell-phones/samsung-galaxy-s-5.html?bvrrp=9060/reviews/product/1/598aea53-16d0-4c12-b53a-105157092c52.htm', function onStart() {
    this.echo('hi');
});

// Scrape rating titles and review dates and keep them in the shared variables.
casper.then(function scrapePage() {
    ratings = this.evaluate(getRatings);
    dates = this.evaluate(getDate);
    this.echo(ratings);
});

// Final callback: pair each rating with its date, write the lines to disk, then exit.
casper.run(function report() {
    var outputPath = "C:/Users/Karan/Copy/tweesis/implementation/scraping/samsungratings.txt";

    this.echo(ratings.length + ' ratings found:');

    var idx = 0;
    while (idx < ratings.length) {
        // Each rating entry becomes "<rating>: <date>"; the consumed date slot is blanked.
        ratings[idx] = ratings[idx] + ': ' + dates[idx];
        dates[idx] = '';
        idx += 1;
    }

    this.echo(ratings);
    // 'w' mode: the file is written from scratch on every run.
    fs.write(outputPath, ratings.join("\n"), 'w');
    this.echo(dates.length + ' dates found:').exit();
});
Any help is appreciated :)
任何帮助表示赞赏:)
回答by Artjom B.
Since there exists a next page button, you can use it to traverse all pages recursively:
由于存在下一页按钮,您可以使用它来递归遍历所有页面:
/**
 * Scrapes the ratings and dates of the currently loaded review page, appends
 * them to the output file and, while a "next page" link is visible, clicks it
 * and schedules itself again for the following page.
 *
 * Relies on names defined elsewhere in the script: `casper`, `fs`,
 * `getRatings`, `getDate` and the shared `ratings` / `dates` variables.
 */
function getRatingsAndWrite(){
    ratings = casper.evaluate(getRatings);
    dates = casper.evaluate(getDate);
    casper.echo(ratings);
    casper.echo(ratings.length + ' ratings found:');
    for(var i=0; i<ratings.length; i++){
        // Each entry becomes "<rating>: <date>"; the consumed date slot is blanked.
        ratings[i] = ratings[i]+': '+dates[i];
        dates[i] = '';
    }
    casper.echo(ratings);
    var content = ratings.join("\n");
    // The file is opened in append mode once per page: end every non-empty
    // chunk with a newline so the last line of this page and the first line
    // of the next page do not end up fused on one line.
    if (ratings.length > 0) {
        content += "\n";
    }
    fs.write("C:/Users/Karan/Copy/tweesis/implementation/scraping/samsungratings.txt", content, 'a');
    casper.echo(dates.length + ' dates found:');
    var nextLink = ".BVRRPageLink.BVRRNextPage > a";
    if (casper.visible(nextLink)) {
        casper.thenClick(nextLink);
        // Give the next page of reviews time to load before scraping again;
        // without a pause the same page gets scraped several times between
        // the click and the moment the new reviews are in place.
        casper.wait(3000);
        casper.then(getRatingsAndWrite);
    } else {
        casper.echo("END");
    }
}
// Open the first reviews page, queue the scraping function as the next step, then run.
casper
    .start('http://www.t-mobile.com/cell-phones/samsung-galaxy-s-5.html?bvrrp=9060/reviews/product/1/598aea53-16d0-4c12-b53a-105157092c52.htm')
    .then(getRatingsAndWrite)
    .run();
A related answer is A: CasperJS parse next page after button click.
回答by Fanch
This code can help you: define the wanted URLs and the selectors for each page in an array of objects, then loop over that array and do whatever you need with these properties.
这段代码可以帮助你:你在一个对象数组中定义想要的 url、每个页面的选择器,然后在循环中用这些属性做你想做的事情。
You can use a click method in the loop instead of url too.
您也可以在循环中使用 click 方法而不是 url 。
// Text accumulated from every visited page.
var content = "";

// One entry per page to visit: its URL plus the selectors for the rating
// images and the review dates on that page.
var navigation = [];

navigation.push({
    url: 'http://www.t-mobile.com/cell-phones/samsung-galaxy-s-5.html?bvrrp=9060/reviews/product/1/598aea53-16d0-4c12-b53a-105157092c52.htm',
    selectorRatings: '#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img',
    selectorDate: '#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate'
});

navigation.push({
    url: 'yourSecondUrl, etc...',
    selectorRatings: '#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img',
    selectorDate: '#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate'
});
// Visit every entry of `navigation` in turn, echo the rating title and the
// date HTML found on each page and accumulate them in `content`.
// Relies on `casper`, `navigation` and `content` being defined earlier in the script.
casper.start()
    .then(function () {
        // loop on the array
        navigation.forEach(function (navIndex) {
            // open url : property url
            casper.thenOpen(navIndex.url)
                // wait for the page to load -> probably redundant, since thenOpen() should already do it
                .waitForUrl(navIndex.url, function () {
                    // get the value of the title attribute for the ratings selector
                    var ratings = this.getElementAttribute(navIndex.selectorRatings, 'title');
                    // get the HTML for the date selector; the property is named
                    // `selectorDate` in `navigation` (not `selectorDates`)
                    var dates = this.getHTML(navIndex.selectorDate);
                    this.echo(ratings);
                    this.echo(dates);
                    content = content + ' ' + ratings + ' ' + dates;
                });
        });
    })
    .run(function () {
        this.echo('----------- All steps done ------------\n');
        this.exit();
    });
回答by karansolo
Thanks Fanch and Artjom B. Both of your answers rendered the working solution. I used the recursive walk through the 'next' pages on the pagination as given by Artjom B. Next, I added a wait() function to make sure the next ratings page was loaded before scraping them. Without this wait() function, we scrape the same page multiple times between the instant that 'next' is clicked and the resp. next page is done loading. See the working code below:
感谢 Fanch 和 Artjom B。您的两个答案都提供了有效的解决方案。我使用了 Artjom B 给出的分页上“下一个”页面的递归遍历。接下来,我添加了一个 wait() 函数以确保在抓取它们之前加载下一个评级页面。如果没有这个 wait() 函数,在点击“下一页”到相应的下一页加载完成之间的这段时间里,我们会多次抓取同一个页面。请参阅下面的工作代码:
// Shared variables that the scraping function overwrites for every page.
var ratings = [],
    dates = [];

// File-system module, used further down to append the scraped lines to a file.
var fs = require('fs');

// Casper instance. The page settings switch off image and plugin loading;
// logging is set to debug level and printed (verbose).
var casper = require('casper').create({
    verbose: true,
    logLevel: "debug",
    pageSettings: {
        loadPlugins: false,
        loadImages: false
    }
});
/**
 * Collects the `title` attribute of every rating image matched on the page.
 * Reads the global `document`, so it is meant to be run in page context
 * (it is passed to `casper.evaluate` elsewhere in this script).
 *
 * @returns {Array} one `title` attribute value per matched <img>, in document order
 */
function getRatings() {
    var selector = '#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img';
    var images = document.querySelectorAll(selector);
    var titles = [];
    for (var idx = 0; idx < images.length; idx++) {
        titles.push(images[idx].getAttribute('title'));
    }
    return titles;
}
/**
 * Collects the inner HTML of every review-date element matched on the page.
 * Reads the global `document`, so it is meant to be run in page context
 * (it is passed to `casper.evaluate` elsewhere in this script).
 *
 * @returns {Array} one `innerHTML` value per matched element, in document order
 */
function getDate() {
    var selector = '#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate';
    var found = [];
    Array.prototype.forEach.call(document.querySelectorAll(selector), function (node) {
        found.push(node.innerHTML);
    });
    return found;
}
/**
 * Scrapes the ratings and dates of the currently loaded review page, appends
 * one "<first character of rating>: <date>" line per review to the output
 * file and, while a "next page" link is visible, clicks it, waits, and
 * schedules itself again for the following page.
 *
 * Relies on names defined elsewhere in the script: `casper`, `fs`,
 * `getRatings`, `getDate` and the shared `ratings` / `dates` variables.
 */
function getRatingsAndWrite(){
    ratings = casper.evaluate(getRatings);
    dates = casper.evaluate(getDate);
    casper.echo(ratings.length + ' ratings found:');
    for(var i=0; i<ratings.length; i++){
        // Keep only the first character of the rating title.
        var rating = ratings[i].substr(0,1);
        ratings[i] = rating +': '+dates[i];
        // The consumed date slot is blanked.
        dates[i] = '';
    }
    var content = ratings.join("\n");
    // The file is opened in append mode once per page: end every non-empty
    // chunk with a newline so the last line of this page and the first line
    // of the next page do not end up fused on one line.
    if (ratings.length > 0) {
        content += "\n";
    }
    fs.write("<filepath to write content>", content, 'a');
    casper.echo(dates.length + ' dates found:');
    var nextLink = ".BVRRPageLink.BVRRNextPage > a";
    if (casper.visible(nextLink)) {
        casper.thenClick(nextLink);
        // Pause so the next ratings page is loaded before it is scraped.
        casper.wait(3000);
        casper.then(getRatingsAndWrite);
    } else {
        casper.echo("END");
    }
}
// Open the product page, queue the scraping function as the next step, then run.
casper
    .start('http://www.t-mobile.com/cell-phones/htc-one-m8.html')
    .then(getRatingsAndWrite)
    .run();