Right now, I am doing some simple web scraping, for example getting the current train arrival/departure information for a railway station. Here is an example link: http://www.thetrainline.com/Live/arrivals/chester. From this link you can see the trains currently arriving at Chester station.
I am using the node.js request module to do some simple web scraping,
// GET /railway/arrival?city=<station-slug>
// Responds with the arrivals scraped for the requested station, falling back
// to "liverpool-james-street" when no city is supplied in the query string.
app.get('/railway/arrival', function (req, res, next) {
  var city = req.query["city"];
  console.log("/railway/arrival/ " + city);
  // `typeof x` always yields a string, so comparing it to `undefined` can
  // never match; `== null` covers both a missing (undefined) and a null value.
  if (city == null) {
    console.log("if it undefined");
    city = "liverpool-james-street";
  }
  getRailwayArrival(city, function (err, data) {
    // Forward scraping failures to the Express error handler instead of
    // silently sending an undefined body.
    if (err) {
      return next(err);
    }
    res.send(data);
  });
});
/**
 * Fetches the live-arrivals page for a station and scrapes one record per
 * listed train.
 *
 * @param {string} station - Station slug appended to the arrivals URL.
 * @param {function} callback - Node-style callback. Called with the request
 *   error when the HTTP request fails, otherwise with `null` and an array of
 *   `{ due, destination, on_time, platform }` objects.
 */
function getRailwayArrival(station, callback) {
  // Strips the line breaks and tabs found inside the scraped text.
  function clean(text) {
    return text.replace(/(\r\n|\n|\r|\t)/gm, "");
  }

  request({
    // Encode the slug so a caller-supplied value cannot change the URL's
    // path or query string.
    uri: "http://www.thetrainline.com/Live/arrivals/" + encodeURIComponent(station),
  }, function (error, response, body) {
    // Without this guard a failed request would try to parse an undefined body.
    if (error) {
      return callback(error);
    }
    var $ = cheerio.load(body);
    var a = [];
    $(".results-contents li a").each(function () {
      var row = $(this);
      var due = clean(row.find('.due').text());
      var destination = clean(row.find('.destination').text());
      var on_time = clean(row.find('.on-time-yes .on-time').text());
      var on_time_no;
      // .text() yields a string (empty when nothing matched), never
      // undefined, so the fallback has to test for the empty string.
      if (on_time === "") {
        on_time_no = clean(row.find('.on-time-no').text());
      }
      var platform = clean(row.find('.platform').text());
      a.push({
        due: due,
        destination: destination,
        on_time: on_time,
        platform: platform
      });
      console.log("arrival ".green+due+" "+destination+" "+on_time+" "+platform+" "+on_time_no);
    });
    console.log("get station data "+a.length +" "+ $(".updated-time").text());
    callback(null, a);
  });
}
The code works and gives me a list of data; however, this data is different from the data seen in the browser, even though both come from the same URL. I don't know why. Is it because their server can distinguish requests sent from a server from those sent by a browser, and sends the wrong data when the request comes from a server? How can I overcome this problem?
thanks in advance.
2 Answers
They must be storing a session per click event. That means that when you visit the page for the first time, it stores a session and validates that session for the next action you perform. Say you select a value from a drop-down list: for that click a new session value is generated, which loads the data for your selected combo-box value. Then, when you click on "show list", the previous session value is validated and you get accurate data.
Now, if you do not capture that session value programmatically and pass it as a parameter with the request, you will get the default loaded data or nothing at all. So it is challenging for you to catch that data. Use Firebug for help.
Another issue here could be that the content is generated by JavaScript that runs on your machine (in the browser). jsdom is a module that will provide such content, but it is not as lightweight.
Cheerio does not execute these scripts, and as a result the content may not be visible (as you're experiencing). This is an article I read a while back that led me to the same discovery; just open the article and search for "jsdom is more powerful" for a quick answer:
Source:http://stackoverflow.com/questions/15785360/data-from-web-scraping-using-node-js-request-is-different-from-data-shown-in-the?rq=1
I am using the node.js request module to do some simple web scraping,
// GET /railway/arrival?city=<station-slug>
// Responds with the arrivals scraped for the requested station, falling back
// to "liverpool-james-street" when no city is supplied in the query string.
app.get('/railway/arrival', function (req, res, next) {
  var city = req.query["city"];
  console.log("/railway/arrival/ " + city);
  // `typeof x` always yields a string, so comparing it to `undefined` can
  // never match; `== null` covers both a missing (undefined) and a null value.
  if (city == null) {
    console.log("if it undefined");
    city = "liverpool-james-street";
  }
  getRailwayArrival(city, function (err, data) {
    // Forward scraping failures to the Express error handler instead of
    // silently sending an undefined body.
    if (err) {
      return next(err);
    }
    res.send(data);
  });
});
/**
 * Fetches the live-arrivals page for a station and scrapes one record per
 * listed train.
 *
 * @param {string} station - Station slug appended to the arrivals URL.
 * @param {function} callback - Node-style callback. Called with the request
 *   error when the HTTP request fails, otherwise with `null` and an array of
 *   `{ due, destination, on_time, platform }` objects.
 */
function getRailwayArrival(station, callback) {
  // Strips the line breaks and tabs found inside the scraped text.
  function clean(text) {
    return text.replace(/(\r\n|\n|\r|\t)/gm, "");
  }

  request({
    // Encode the slug so a caller-supplied value cannot change the URL's
    // path or query string.
    uri: "http://www.thetrainline.com/Live/arrivals/" + encodeURIComponent(station),
  }, function (error, response, body) {
    // Without this guard a failed request would try to parse an undefined body.
    if (error) {
      return callback(error);
    }
    var $ = cheerio.load(body);
    var a = [];
    $(".results-contents li a").each(function () {
      var row = $(this);
      var due = clean(row.find('.due').text());
      var destination = clean(row.find('.destination').text());
      var on_time = clean(row.find('.on-time-yes .on-time').text());
      var on_time_no;
      // .text() yields a string (empty when nothing matched), never
      // undefined, so the fallback has to test for the empty string.
      if (on_time === "") {
        on_time_no = clean(row.find('.on-time-no').text());
      }
      var platform = clean(row.find('.platform').text());
      a.push({
        due: due,
        destination: destination,
        on_time: on_time,
        platform: platform
      });
      console.log("arrival ".green+due+" "+destination+" "+on_time+" "+platform+" "+on_time_no);
    });
    console.log("get station data "+a.length +" "+ $(".updated-time").text());
    callback(null, a);
  });
}
The code works and gives me a list of data; however, this data is different from the data seen in the browser, even though both come from the same URL. I don't know why. Is it because their server can distinguish requests sent from a server from those sent by a browser, and sends the wrong data when the request comes from a server? How can I overcome this problem?
thanks in advance.
2 Answers
They must be storing a session per click event. That means that when you visit the page for the first time, it stores a session and validates that session for the next action you perform. Say you select a value from a drop-down list: for that click a new session value is generated, which loads the data for your selected combo-box value. Then, when you click on "show list", the previous session value is validated and you get accurate data.
Now, if you do not capture that session value programmatically and pass it as a parameter with the request, you will get the default loaded data or nothing at all. So it is challenging for you to catch that data. Use Firebug for help.
Another issue here could be that the content is generated by JavaScript that runs on your machine (in the browser). jsdom is a module that will provide such content, but it is not as lightweight.
Cheerio does not execute these scripts, and as a result the content may not be visible (as you're experiencing). This is an article I read a while back that led me to the same discovery; just open the article and search for "jsdom is more powerful" for a quick answer:
Source:http://stackoverflow.com/questions/15785360/data-from-web-scraping-using-node-js-request-is-different-from-data-shown-in-the?rq=1
No comments:
Post a Comment