Javascript 如何使用 phantomjs 抓取链接
声明:本页面是StackOverFlow热门问题的中英对照翻译,遵循CC BY-SA 4.0协议,如果您需要使用它,必须同样遵循CC BY-SA许可,注明原文地址和作者信息,同时你必须将它归于原作者(不是我):StackOverFlow
原文地址: http://stackoverflow.com/questions/13944518/
Warning: these are provided under cc-by-sa 4.0 license. You are free to use/share it, But you must attribute it to the original authors (not me):
StackOverFlow
how to scrape links with phantomjs
提问by john mangual
Can PhantomJSbe used an an alternative to BeautifulSoup?
可以PhantomJS使用的一种替代BeautifulSoup?
I am trying to search on Etsy and visit all the links in term. In Python, I know how to do this (with BeautifulSoup) but today I want to see if I can do the same with PhantomJS. I'm not getting very far.
我正在尝试在 Etsy 上搜索并访问所有链接。在 Python 中,我知道如何做到这一点(使用 BeautifulSoup),但今天我想看看我是否可以使用 PhantomJS 做到这一点。我不会走得很远。
This script should search "hello kitty" on Etsy and return all the of products
<a class="listing-thumb" href=...></a>and print them in the console. Ideally I'd visit them later on and get the information I need. Right now it just freezes. Any ideas?
该脚本应该在 Etsy 上搜索“hello kitty”并返回所有产品
<a class="listing-thumb" href=...></a>并在控制台中打印它们。理想情况下,我稍后会访问他们并获取我需要的信息。现在它只是冻结。有任何想法吗?
var page = require('webpage').create();
var url = 'http://www.etsy.com/search?q=hello%20kitty';
page.open(url, function(status){
// list all the a.href links in the hello kitty etsy page
var link = page.evaluate(function() {
return document.querySelectorAll('a.listing-thumb');
});
for(var i = 0; i < link.length; i++){ console.log(link[i].href); }
phantom.exit();
});
I have toyed with using CasperJS, which may be better designed for this.
我曾尝试使用CasperJS,这可能是为此设计的更好。
回答by NiKo
PhantomJS evaluate()cannot serialize and return complex objects like HTMLElements or NodeLists, so you have to map them to serializable things before:
PhantomJSevaluate()不能序列化和返回像 HTMLElements 或 NodeLists 这样的复杂对象,所以你必须在之前将它们映射到可序列化的东西:
var page = require('webpage').create();
var url = 'http://www.etsy.com/search?q=hello%20kitty';
page.open(url, function(status) {
// list all the a.href links in the hello kitty etsy page
var links = page.evaluate(function() {
return [].map.call(document.querySelectorAll('a.listing-thumb'), function(link) {
return link.getAttribute('href');
});
});
console.log(links.join('\n'));
phantom.exit();
});
Note: here we use [].map.call()in order to treat a NodeListas a standard Array.
注意:这里我们使用[].map.call()是为了将 aNodeList视为标准Array.
回答by Darius Kucinskas
The only problem with your code is that you do not understand phantomjs scopes. You have phantom and page scopes. You tried to return JavaScript DOM object references (those can't be serialized) from page scope (page.evaluate runs in page scope) to phantom main scope. I think that is not possible. Here follows code that works:
您的代码唯一的问题是您不了解 phantomjs 范围。您有幻像和页面范围。您试图将 JavaScript DOM 对象引用(那些无法序列化的)从页面范围(page.evaluate 在页面范围内运行)返回到幻像主范围。我认为这是不可能的。以下是有效的代码:
var page = require('webpage').create();
var url = 'http://www.etsy.com/search?q=hello%20kitty';
// for debug (to see if page returns status code 200)
page.onResourceReceived = function(response) {
if (response.url === url) {
console.log('Resorce: "' + response.url + '" status: ' + response.status);
if (response.status === 200) {
console.log(response.url);
for (var i = 0; i < response.headers.length; i++) {
console.log(response.headers[i].name + ': ' + response.headers[i].value);
}
}
}
};
page.onLoadFinished = function(status){
console.log('Status: ' + status);
console.log('Starting evaluate...');
var links = page.evaluate(function() {
var nodes = [],
matches = document.querySelectorAll("a.listing-thumb");
for(var i = 0; i < matches.length; ++i) {
nodes.push(matches[i].href);
}
return nodes;
});
console.log('Done evaluate... count: ' + links.length);
if (links && links.length > 0) {
for(var i = 0; i < links.length; ++i) {
console.log('(' + i + ') ' + links[i]);
}
} else {
console.log("No match found!");
}
phantom.exit(0);
};
page.open(url);
回答by Darius Kucinskas
Here is some code I recently wrote that scrapes urls using PhantomJs, if you provide only a URL it will display all URLS's on the page, if you supply an argument of class|idfollowed by a "class/id name" it will display the urls of the class/id only.
这是我最近编写的一些使用 PhantomJs 抓取 url 的代码,如果您只提供一个 URL,它将显示页面上的所有 URL,如果您提供class|id后跟“类/ID 名称”的参数,它将显示仅类/ID。
//////////////////////////////////////////////////////////
///// PhantomJS URL Scraper v.1.3 /////
//
// Copyrighted by +A.M.Danischewski 2016+ (c)
// This program may be reutilized without limits, provided this
// notice remain intact.
//
// Usage: phantomjs phantom_urls.js <URL> [["class"|"id"] [<query id/class name>]]
//
// Argument 1: URL -- "https://www.youtube.com/watch?v=8TniRMwL2Vg"
// Argument 2: "class" or "id"
// Argument 3: If Argument 2 was provided, "class name" or "id name"
//
// By default this program will display ALL urls from a user supplied URL.
// If a class name or id name is provided then only URL's from the class
// or id are displayed.
//
///////////////////////////////////
var page = require('webpage').create(),
system = require('system'),
address;
if (system.args.length === 1) {
console.log(' Usage: phantomjs phantom_urls.js <URL> [["class"|"id"] [<query id/class name>]]');
phantom.exit();
}
address = system.args[1];
querytype= system.args[2];
queryclass = system.args[3];
page.open(address, function(status) {
if (status !== 'success') {
console.log('Error loading address: '+address);
} else {
//console.log('Success! In loading address: '+address);
}
});
page.onConsoleMessage = function(msg) {
console.log(msg);
}
page.onLoadFinished = function(status) {
var dynclass="function() { window.class_urls = new Array(); window.class_urls_next=0; var listings = document.getElementsByClassName('"+queryclass+"'); for (var i=0; i < listings.length; i++) { var el = listings[i]; var ellnks=[].map.call(el.querySelectorAll('a'),function(link) {return link.getAttribute('href');}); var elhtml=el.innerHTML; window.class_urls.push(ellnks.join('\n')); }; return window.class_urls;}";
var dynid="function() { window.id_urls = new Array(); window.id_urls_next=0; var listings = document.getElementById('"+queryclass+"'); var ellnks=[].map.call(listings.querySelectorAll('a'),function(link) {return link.getAttribute('href');}); var elhtml=listings.innerHTML; window.id_urls.push(ellnks.join('\n')); return window.id_urls;}";
var allurls="function() { var links = page.evaluate(function() { return [].map.call(document.querySelectorAll('a'), function(link) { return link.getAttribute('href'); };); };); console.log(links.join('\n')); }";
var page_eval_function="";
if (querytype === "class") {
console.log(page.evaluate(dynclass).toString().replace(/,/g, "\n"));
} else if (querytype === "id") {
console.log(page.evaluate(dynid).toString().replace(/,/g, "\n"));
} else {
var links = page.evaluate(function() {
return [].map.call(document.querySelectorAll('a'), function(link) {
return link.getAttribute('href');
});
});
console.log(links.join('\n'));
}
phantom.exit();
};

