I’m trying to get the content of an HTML page with a Node.js app.
I found this code in: In Node.js / Express, how do I "download" a page and gets its HTML? (yojimbo's answer), and it seems to work well. However, when I run the code, I get the HTML of a 301 Moved Permanently response, but the redirect link is the same as the one I sent!
// Fetch a page over plain HTTP and log its body once the response has ended.
var util = require("util"),
    http = require("http");

// Target of the GET request (http.request defaults to the GET method).
var options = {
    host: "www.mylink.com",
    port: 80,
    path: "/folder/content.xml"
};

// Accumulates the decoded response body across "data" events.
var content = "";

var req = http.request(options, function (res) {
    // Deliver chunks as UTF-8 strings so they can be concatenated directly.
    res.setEncoding("utf8");
    res.on("data", function (chunk) {
        content += chunk;
    });
    res.on("end", function () {
        util.log(content);
    });
});

// Without an "error" listener, a DNS or connection failure is raised as an
// uncaught exception and takes the whole process down.
req.on("error", function (err) {
    util.log("Request failed: " + err.message);
});

// end() marks the request as complete; http.request() always requires it.
req.end();
30 Jul 13:08:52 - <!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">
<html><head>
<title>301 Moved Permanently</title>
</head><body>
<p>The document has moved <a href="http://mylink.com/folder/content.xml">here</a>.</p>
<hr>
<address>Apache/2.2.22 (Ubuntu) Server at www.mylink.com Port 80</address>
</body></html>
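A 301 status code indicates that the requested resource has moved permanently and that the client should repeat the request at the URI given in the response's Location header. Note that the redirect target is not identical to the URL you requested: you asked for www.mylink.com, while the response points to mylink.com (without the www prefix). The http
module doesn't follow redirects (3xx status codes) by default.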
You can use the request module, which follows redirects for you. From its documentation:
Request is designed to be the simplest way possible to make http calls. It supports HTTPS and follows redirects by default.
To do it manually, read the Location header from the response and issue a new request to that URI:
// Request `options` and log the response body, following a single 301/302
// redirect by hand. Relies on `http`, `util`, `options` and `content` from the
// surrounding script. As with any http.request() call, req.end() must still
// be called afterwards to actually send this request.
var req = http.request(options, function (res) {
    // Collects a response body into `content` and logs it when complete.
    // Used for both the direct response and the redirected one.
    function collectBody(response) {
        response.setEncoding("utf8");
        response.on("data", function (chunk) {
            content += chunk;
        });
        response.on("end", function () {
            util.log(content);
        });
    }

    if (res.statusCode === 301 || res.statusCode === 302) {
        var newRequestUri = res.headers.location;

        // Drain the redirect page's own body: it is not wanted in `content`,
        // and an unconsumed response keeps its connection occupied.
        res.resume();

        // Only absolute http:// targets can be handed to the http module;
        // a missing, relative or https:// Location is reported instead.
        if (typeof newRequestUri !== "string" || !/^http:\/\//i.test(newRequestUri)) {
            util.log("Cannot follow redirect to: " + newRequestUri);
            return;
        }

        // Location holds a full URL, not a host name, so pass it as the URL
        // argument rather than as `hostname`.
        var redirectReq = http.request(newRequestUri, collectBody);
        redirectReq.on("error", function (err) {
            util.log("Redirected request failed: " + err.message);
        });
        // The new request is not sent until end() is called.
        redirectReq.end();
        return;
    }

    collectBody(res);
});