Nb_me Nb_me - 1 month ago 9
HTML Question

remove the whitespace from web-scraping <tr> tags nodejs

I have a problem that is beyond me to solve. I am web-scraping a webpage more specific -its in nodejs, and am able to grab the content but for some reason the format is not ideal. It has a bunch of white-spaces when viewed in the console.log(). I tried the .trim() and .replace() but it did not really remove the spaces.I am guessing it is because the data was nested in the div and table format. I don't know how to deal with it.

Also I tried to save it as array but all the output is not broken up it is just one block of data. If there is a answer to this question already please provide the link and disregard my post.

Here is a copy of the code

var request = require('request');
var cheerio = require('cheerio');
var URL = 'http://www.hcad.org';

var content = [];
var Tr = [];
request(URL, function(error, response,html){
if(error){
console.log('Error happened: ', error);
}
if (response.statusCode !== 200) {
console.log('Invaled response code returned: ', response.statusCode);
}
var $ = cheerio.load(html);


$('tr').each(function (i, element) {

content = [];
var a = $(this).prev();
var trimmed_a = a.text();

trimmed_a = trimmed_a.trim();
var str = trimmed_a.replace(/(\r\n|\n|\r|\t)/gm, " ");
var newStr = str.replace(/[^\x20-\x7E]/gmi, "");;

content.push(newStr.trim());
console.log(newStr.trim());
});



})

Answer

I think you are missing replace using whitespace as suggested in Regex to replace multiple spaces with a single space:

string = string.replace(/\s\s+/g, ' ');

See sample for a cut-down version using jQuery.