Matt Penfold Matt Penfold - 2 months ago 8
HTML Question

scraping a website to an array using table

can someone help please.

I want to scrape a website that contains a table.

the table headers are

scheduled,
from,
flight,
status,
terminal,

I then want to output this to an array.

Then I want to search the array, outputting the data into text boxes.

can anyone help please.

Many thanks in advance

I have tried the code below:

<?php

/*
 * Download the flights page, read every <th> and <td> in the document and
 * print an array of rows in which each cell is keyed by its column header.
 *
 * NOTE: the cells are split into rows purely by counting, so this assumes
 * every row has as many <td> cells as the document has <th> cells.
 */

$htmlContent = file_get_contents("http://www.gatwickairport.com/flights/");

//#file_get_contents() returns false on failure; stop instead of parsing nothing
if ($htmlContent === false || $htmlContent === '') {
    die('Unable to download the page');
}

//#Live pages are rarely valid HTML: collect libxml parse errors internally
//#instead of emitting one PHP warning per markup problem
$bPreviousErrorMode = libxml_use_internal_errors(true);
$DOM = new DOMDocument();
$DOM->loadHTML($htmlContent);
libxml_clear_errors();
libxml_use_internal_errors($bPreviousErrorMode);

$Header = $DOM->getElementsByTagName('th');
$Detail = $DOM->getElementsByTagName('td');

//#Get header name of the table
$aDataTableHeaderHTML = array();
foreach ($Header as $NodeHeader) {
    $aDataTableHeaderHTML[] = trim($NodeHeader->textContent);
}

//#Without headers the row width is unknown (and would be a modulo by zero)
$iColumnCount = count($aDataTableHeaderHTML);
if ($iColumnCount === 0) {
    die('No table header cells found');
}

//#Get row data/detail table without header name as key:
//#collect every cell, then cut the flat list into rows of $iColumnCount cells
$aCellText = array();
foreach ($Detail as $sNodeDetail) {
    $aCellText[] = trim($sNodeDetail->textContent);
}
$aDataTableDetailHTML = array_chunk($aCellText, $iColumnCount);

//#Get row data/detail table with header name as key and outer array index as row number
$aTempData = array();
foreach ($aDataTableDetailHTML as $iRow => $aRowCells) {
    foreach ($aDataTableHeaderHTML as $iCol => $sHeaderName) {
        //#The last row may be shorter than the header list; store null for missing cells
        $aTempData[$iRow][$sHeaderName] = isset($aRowCells[$iCol]) ? $aRowCells[$iCol] : null;
    }
}
$aDataTableDetailHTML = $aTempData; unset($aTempData);
print_r($aDataTableDetailHTML); die();
?>


But I need it to keep only columns 1, 2, 3, 4 and 5, and to leave out columns 0, 6, 7, 8 and 9.

Many thanks in advance

Answer

You could try something along these lines: list the indices of the columns you wish to keep, then use in_array to test each column index against that list.

/* Zero-based indices of the columns to keep */
$cols=array(1,2,3,4,5);

/* Start from an empty result so it is defined even when there are no rows */
$aTempData=array();

/* Neither array changes inside the loops, so count them once */
$rowCount=count($aDataTableDetailHTML);
$colCount=count($aDataTableHeaderHTML);

for($i = 0; $i < $rowCount; $i++){
    for($j = 0; $j < $colCount; $j++){
        /* strict comparison: only exact integer indices match */
        if( !in_array( $j, $cols, true ) ) continue;

        /* a row may hold fewer cells than there are headers; store null rather than read a missing offset */
        $aTempData[$i][$aDataTableHeaderHTML[$j]] = isset( $aDataTableDetailHTML[$i][$j] ) ? $aDataTableDetailHTML[$i][$j] : null;
    }
}

To answer your subsequent question I hope the following might be of use to you. ( Try running it "as-is" to see the results )

/*
    Download the flights page and extract selected table columns.
    Produces two arrays which are printed at the end:
        $headers - text of the header cells that were kept
        $data    - kept cell values grouped under 'row_1', 'row_2', ...
*/
$url='http://www.gatwickairport.com/flights/';
$html=file_get_contents( $url );


if( $html ){

    /* Live pages are rarely valid HTML: collect libxml parse errors internally rather than emit a warning for each */
    $previous=libxml_use_internal_errors( true );
    $dom=new DOMDocument;
    $dom->loadHTML( $html );
    libxml_clear_errors();
    libxml_use_internal_errors( $previous );
    $xp=new DOMXPath( $dom );


    $headers=array();
    $data=array();
    /* 1-based positions of the columns to keep */
    $cols=array(1,2,3,4,5);


    /* Get column headers */
    $col=$xp->query( '//div[@class="container"]/table/thead/tr/th' );

    /* Get & store the number of column headers ( query() yields false for a bad expression ) */
    $length=( $col !== false ) ? $col->length : 0;

    /* Iterate through headers and store the ones you wish to keep */
    if( $length > 0 ){
        foreach( $col as $i => $node ) {
            /* array/collection is zero based so we need one more as the index */
            $j=$i+1;
            if( in_array( $j, $cols, true ) ) $headers[]=trim( $node->nodeValue );
        }
    }

    /* Get row data */
    $col=$xp->query( '//div[@class="container"]/table/tbody/tr/td' );

    /*
        $j is the 1-based column position of the current cell, $r the 1-based row number.
        Cells are assigned to rows by counting, which assumes each row has $length cells.
    */
    $j=1;
    $r=1;

    /* Without a header count the cells cannot be split into rows, so skip them */
    if( $length > 0 && $col !== false ){
        foreach( $col as $node ) {

            if( in_array( $j, $cols, true ) ) {
                /* standard cell content */
                $value=trim( $node->nodeValue );

                /* Because the airline column displays an image, treat it differently */
                if( $j==1 ){
                    foreach( $node->childNodes as $child ){
                        /* whitespace-only text nodes often precede the image; step over them */
                        if( $child instanceof DOMText && trim( $child->nodeValue )==='' ) continue;

                        /* only elements have a tagName; use the alt text when the first real child is an image */
                        if( $child instanceof DOMElement && $child->tagName==='img' ) $value=$child->getAttribute('alt');
                        break;
                    }
                }

                /* Add new row/cell data to output */
                $data[ 'row_'.$r ][]=$value;
            }

            /* End of row reached: restart the column counter and move to the next row */
            if( $j >= $length ) {
                $j=0;
                $r++;
            }
            $j++;
        }
    }


    /* do stuff with the output data ( 2 arrays ) - escaped because the text comes from a remote page */
    echo '<pre>',htmlspecialchars( print_r($headers,true), ENT_QUOTES, 'UTF-8' ),'</pre>';
    echo '<pre>',htmlspecialchars( print_r($data,true), ENT_QUOTES, 'UTF-8' ),'</pre>';
}