Javascript 如何使用 phantomjs 抓取链接

Question

提问by john mangual

Can PhantomJS be used as an alternative to BeautifulSoup?

I am trying to search on Etsy and visit all the links in turn. In Python, I know how to do this (with BeautifulSoup), but today I want to see if I can do the same with PhantomJS. I'm not getting very far.

我正在尝试在 Etsy 上搜索并依次访问所有链接。在 Python 中，我知道如何做到这一点（使用 BeautifulSoup），但今天我想看看能否用 PhantomJS 做同样的事情。目前进展不大。

This script should search "hello kitty" on Etsy, return all of the product links <a class="listing-thumb" href=...></a>, and print them in the console. Ideally I'd visit them later on and get the information I need. Right now it just freezes. Any ideas?

该脚本应该在 Etsy 上搜索“hello kitty”，返回所有产品链接 <a class="listing-thumb" href=...></a>，并在控制台中打印它们。理想情况下，我稍后会逐个访问它们并获取我需要的信息。现在它只是卡住不动。有什么想法吗？

// Original attempt from the question: open an Etsy search page and print the
// href of every a.listing-thumb anchor. As posted, this script hangs instead
// of printing anything.
var page = require('webpage').create();
var url = 'http://www.etsy.com/search?q=hello%20kitty';

page.open(url, function(status){
    // list all the a.href links in the hello kitty etsy page
    // The function given to page.evaluate() returns the raw NodeList produced by
    // querySelectorAll. evaluate() cannot serialize DOM objects such as
    // HTMLElements or NodeLists, so `link` does not hold usable elements here.
    var link = page.evaluate(function() {
        return document.querySelectorAll('a.listing-thumb');
    });
    // NOTE(review): presumably link[i].href throws on the unserialized result, so
    // phantom.exit() below is never reached and the process appears to freeze —
    // confirm against the PhantomJS version in use.
    for(var i = 0; i < link.length; i++){ console.log(link[i].href); }
    phantom.exit();
});

I have toyed with using CasperJS, which may be better designed for this.

我曾尝试使用CasperJS，这可能是为此设计的更好。

Answer 1

回答by NiKo

PhantomJS's evaluate() cannot serialize and return complex objects like HTMLElements or NodeLists, so you have to map them to serializable values first:

PhantomJS 的 evaluate() 不能序列化并返回像 HTMLElement 或 NodeList 这样的复杂对象，所以你必须先将它们映射为可序列化的值：

// Print the href attribute of every a.listing-thumb anchor found on an Etsy
// search page, one per line. Kept in ES5 syntax (var / function expressions)
// to match the PhantomJS scripts around it.
var page = require('webpage').create();
var url = 'http://www.etsy.com/search?q=hello%20kitty';

page.open(url, function(status) {
    // open() passes 'success' or 'fail'. Report a failed load explicitly
    // instead of evaluating against a page that never loaded.
    if (status !== 'success') {
        console.log('Failed to load ' + url + ' (status: ' + status + ')');
        phantom.exit(1);
        // Return so nothing else in this callback runs while PhantomJS shuts down.
        return;
    }

    // list all the a.href links in the hello kitty etsy page
    // The function below runs inside the page; only serializable values can
    // cross back, so the NodeList is mapped to an array of plain strings.
    var links = page.evaluate(function() {
        return [].map.call(document.querySelectorAll('a.listing-thumb'), function(link) {
            return link.getAttribute('href');
        });
    });

    // Guard against a null/undefined result so join() cannot throw and skip
    // phantom.exit(), which would leave the process hanging.
    console.log((links || []).join('\n'));
    phantom.exit();
});

Note: here we use [].map.call() in order to treat a NodeList as a standard Array.

注意：这里我们使用 [].map.call() 是为了将 NodeList 当作标准 Array 来处理。

Answer 2

回答by Darius Kucinskas

The only problem with your code is that you do not understand phantomjs scopes. You have phantom and page scopes. You tried to return JavaScript DOM object references (those can't be serialized) from page scope (page.evaluate runs in page scope) to phantom main scope. I think that is not possible. Here follows code that works:

您的代码唯一的问题是您不了解 phantomjs 的作用域。这里有 phantom 作用域和 page 作用域。您试图将 JavaScript DOM 对象引用（它们无法被序列化）从 page 作用域（page.evaluate 在 page 作用域内运行）返回到 phantom 主作用域。我认为这是不可能的。以下是可以正常工作的代码：

// Load an Etsy search page, log response details for the main URL, and print
// the absolute href of every a.listing-thumb anchor with its index.
// Kept in ES5 syntax to match the PhantomJS scripts around it.
var page = require('webpage').create();
var url = 'http://www.etsy.com/search?q=hello%20kitty';

// for debug (to see if page returns status code 200)
// Only responses whose URL equals `url` exactly are logged; on a 200 the
// response headers are dumped as "name: value" lines.
page.onResourceReceived = function(response) {
    if (response.url === url) {
        console.log('Resorce: "' + response.url + '" status: '  + response.status);

        if (response.status === 200) {
            console.log(response.url);
            for (var i = 0; i < response.headers.length; i++) {
                console.log(response.headers[i].name + ': ' + response.headers[i].value);
            }
        }
    }
};

// Called with 'success' or 'fail' once the page load completes.
page.onLoadFinished = function(status){
    console.log('Status: ' + status);

    // A failed load is reported as such rather than as "No match found!".
    if (status !== 'success') {
        console.log('Failed to load ' + url);
        phantom.exit(1);
        // Return so nothing else in this handler runs while PhantomJS shuts down.
        return;
    }

    console.log('Starting evaluate...');
    // The function below runs in the page scope; DOM nodes cannot be returned
    // to the phantom scope, so only the href strings are collected.
    // Fall back to [] so the length reads below can never throw on a null result
    // and skip phantom.exit().
    var links = page.evaluate(function() {
        var nodes = [],
            matches = document.querySelectorAll("a.listing-thumb");

        for(var i = 0; i < matches.length; ++i) {
            nodes.push(matches[i].href);
        }

        return nodes;
    }) || [];
    console.log('Done evaluate... count: ' + links.length);

    if (links.length > 0) {
        for(var i = 0; i < links.length; ++i) {
            console.log('(' + i + ') ' + links[i]);
        }
    } else {
        console.log("No match found!");
    }

    phantom.exit(0);
};

page.open(url);

Answer 3

回答by Darius Kucinskas

Here is some code I recently wrote that scrapes URLs using PhantomJS. If you provide only a URL, it will display all URLs on the page; if you supply an argument of class|id followed by a "class/id name", it will display only the URLs within that class/id.

这是我最近编写的一些使用 PhantomJS 抓取 URL 的代码。如果您只提供一个 URL，它将显示页面上的所有 URL；如果您提供 class|id 参数并后跟“类/ID 名称”，它将只显示该类/ID 内的 URL。

////////////////////////////////////////////////////////// 
/////  PhantomJS URL Scraper v.1.3 ///// 
// 
// Copyrighted by +A.M.Danischewski  2016+ (c)
// This program may be reutilized without limits, provided this 
// notice remain intact. 
// 
// Usage: phantomjs phantom_urls.js <URL> [["class"|"id"] [<query id/class name>]]
//
//   Argument 1: URL -- "https://www.youtube.com/watch?v=8TniRMwL2Vg" 
//   Argument 2: "class" or "id" 
//   Argument 3: If Argument 2 was provided, "class name" or "id name" 
// 
// By default this program will display ALL urls from a user supplied URL.  
// If a class name or id name is provided then only URL's from the class 
// or id are displayed.  
//  
/////////////////////////////////// 

var page = require('webpage').create(), 
    system = require('system');

var USAGE = ' Usage: phantomjs phantom_urls.js <URL> [["class"|"id"] [<query id/class name>]]';

/**
 * Collects the href attribute of anchors in the current document.
 *
 * This function is handed to page.evaluate(), so it executes inside the loaded
 * page. It must be self-contained: it cannot see any variable of this script,
 * and both its arguments and its return value have to be serializable.
 *
 * @param {string} kind "class" to search inside every element with the given
 *                      class name, "id" to search inside the element with the
 *                      given id, anything else to search the whole document.
 * @param {string} name Class name or id used when kind is "class" or "id".
 * @returns {Array} One entry per anchor, in document order per container:
 *                  the raw href attribute value, or null when the anchor has
 *                  no href. Empty when nothing matches.
 */
function collectHrefs(kind, name) {
  var containers;
  if (kind === 'class') {
    containers = document.getElementsByClassName(name);
  } else if (kind === 'id') {
    // getElementById yields null for an unknown id; treat that as "no match"
    // instead of dereferencing it.
    var byId = document.getElementById(name);
    containers = byId ? [byId] : [];
  } else {
    containers = [document];
  }

  var hrefs = [];
  for (var i = 0; i < containers.length; i++) {
    var anchors = containers[i].querySelectorAll('a');
    for (var j = 0; j < anchors.length; j++) {
      hrefs.push(anchors[j].getAttribute('href'));
    }
  }
  return hrefs;
}

/**
 * Opens `address`, prints the collected hrefs one per line, and exits.
 *
 * @param {string} address   URL to load.
 * @param {string} querytype "class", "id", or anything else for all anchors.
 * @param {string} queryname Class name or id to restrict the search to.
 */
function scrape(address, querytype, queryname) {
  // Forward console output produced inside the page to this script's stdout.
  // Installed before open() so no message can be missed.
  page.onConsoleMessage = function(msg) {
    console.log(msg);
  };

  page.open(address, function(status) {
    if (status !== 'success') {
      console.log('Error loading address: '+address);
      phantom.exit(1);
      // Return so nothing else in this callback runs while PhantomJS shuts down.
      return;
    }

    // The query type and name travel as evaluate() arguments rather than being
    // spliced into source text, so quotes in the name cannot break the code.
    // Fall back to [] so join() cannot throw and skip phantom.exit().
    var hrefs = page.evaluate(collectHrefs, querytype, queryname) || [];

    // Joined directly with newlines so URLs containing commas stay intact.
    console.log(hrefs.join('\n'));
    phantom.exit();
  });
}

if (system.args.length === 1) {
  console.log(USAGE);
  phantom.exit();
} else {
  // Only start loading when a URL was supplied; the usage branch above must
  // not fall through into page.open().
  scrape(system.args[1], system.args[2] || '', system.args[3] || '');
}

Javascript 如何使用 phantomjs 抓取链接

提问by john mangual

回答by NiKo

回答by Darius Kucinskas

回答by Darius Kucinskas

相关推荐

最近更新

标签

Javascript 如何使用 phantomjs 抓取链接

提问by john mangual

回答by NiKo

回答by Darius Kucinskas

回答by Darius Kucinskas

相关推荐

Javascript 从json对象获取属性键

Javascript 为什么会引入新的 JSLint 错误“使用空格，而不是制表符”和“不安全字符”？

Javascript 为多个元素添加事件监听器

Javascript 获取jquery中所有具有相同名称属性的文本框的值

相关推荐

最近更新

标签