www.webdeveloper.com
Results 1 to 1 of 1

Thread: Missing data when scraping from website

Threaded View

  1. #1
    Join Date
    Sep 2008
    Posts
    62

    Missing data when scraping from website

    Hi guys,

    I have a problem scraping data from a third-party website. I'm currently calling preg_match_all once for each time tag and each title tag to extract their values and output them on my own website, but some of the data is missing from the output.


    Here's what the HTML from the third-party site looks like:
    Code:
        <span id="row1Time" class="zc-ssl-pg-time">9:00 AM</span>
        <a id="rowTitle1" class="zc-ssl-pg-title">CBS News Sunday Morning</a>
        <span id="row2Time" class="zc-ssl-pg-time">10:30 AM</span>
        <a id="rowTitle2" class="zc-ssl-pg-title">Face the Nation</a>
        <span id="row3Time" class="zc-ssl-pg-time">11:30 AM</span>
        <span id="rowTitle3" class="zc-ssl-pg-title">Local Programming</span>
        <span id="row4Time" class="zc-ssl-pg-time">12:00 PM</span>
        <a id="rowTitle4" class="zc-ssl-pg-title">The NFL Today</a>
        <span id="row5Time" class="zc-ssl-pg-time">1:00 PM</span>
        <a id="rowTitle5" class="zc-ssl-pg-title">NFL Football</a>
        <span id="row6Time" class="zc-ssl-pg-time">4:30 PM</span>
        <a id="rowTitle6" class="zc-ssl-pg-title"'>2013 U.S. Open Tennis</a>
        <span id="row7Time" class="zc-ssl-pg-time">7:00 PM</span>
        <span id="rowTitle7" class="zc-ssl-pg-title">Local Programming</span>
        <span id="row8Time" class="zc-ssl-pg-time">7:30 PM</span>
        <a id="rowTitle8" class="zc-ssl-pg-title">CBS Evening News</a>

    Here is the HTML output data on my website:
    Code:
        <span id='time1'>9:00 AM</span> - <span id='title1'>CBS News Sunday Morning</span><br></br>
        <span id='time2'>10:30 AM</span> - <span id='title2'>Face the Nation</span><br></br>
        <span id='time3'></span> - <span id='title3'></span><br></br>
        <span id='time4'>12:00 PM</span> - <span id='title4'>The NFL Today</span><br></br>
        <span id='time5'>3:30 PM</span> - <span id='title5'>The Bold and the Beautiful</span><br></br>
        <span id='time6'>4:00 PM</span> - <span id='title6'>The Talk</span><br></br>
        <span id='time7'></span> - <span id='title7'></span><br></br>
        <span id='time8'>7:30 PM</span> - <span id='title8'>CBS Evening News</span><br></br>

    Here's the php code:
    PHP Code:
        <?php
          define
    ('DB_HOST''localhost');
          
    define('DB_USER''myusername');
          
    define('DB_PASSWORD''mypassword');
          
    define('DB_DATABASE''mydb');
              
          
    $errmsg_arr = array();
          
    $errflag false;
          
    $link mysql_connect(DB_HOSTDB_USERDB_PASSWORD);
          
          if(!
    $link
          {
            die(
    'Failed to connect to server: ' mysql_error());
          }
        
          
    $db mysql_select_db(DB_DATABASE);
          if(!
    $db
          {
            die(
    "Unable to select database");
          }
        
          function 
    clean($var)
          {
            return 
    mysql_real_escape_string(strip_tags($var));
          }
          
    $channels clean($_GET['channels']);
          
    $id clean($_GET['id']);
          
          if(
    $errflag
          {
            
    $_SESSION['ERRMSG_ARR'] = $errmsg_arr;
            echo 
    implode('<br />',$errmsg_arr);
          }
          else 
          {
            
    $insert = array();
            
            if(isset(
    $_GET['channels'])) 
            {
              
    $insert[] = 'channels = \'' clean($_GET['channels']) .'\'';
            }
            if(isset(
    $_GET['id'])) 
            {
              
    $insert[] = 'id = \'' clean($_GET['id']) . '\'';
            }
            
            
            if(
    $channels && $id
            {
              
    $qrytable1="SELECT id, channels, links FROM tvguide WHERE channels='$channels' && id='$id'";
              
    $result1=mysql_query($qrytable1) or die('Error:<br />' $qry '<br />' mysql_error());
                  
                
              while (
    $row mysql_fetch_array($result1)) 
              {
            
                
    $links $row['links'];
                
    $data file_get_contents($links);
                
    preg_match_all('/<span id="row1Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im'$data$matches);
                
    preg_match_all('/<a id="rowTitle1\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im'$data$matches1);
                
    $time1 $matches[1];
                
    $titles1 $matches1[1];
                echo 
    "<span id='time1'>".$time1[1]."</span> - <span id='title1'>".$titles1[1]."</span><br></br>";
        
                
    preg_match_all('/<span id="row2Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im'$data$matches);
                
    preg_match_all('/<a id="rowTitle2\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im'$data$matches2);
                
    $time2 $matches[1];
                
    $titles2 $matches2[1];
                echo 
    "<span id='time2'>".$time2[1]."</span> - <span id='title2'>".$titles2[1]."</span><br></br>";
        
                
    preg_match_all('/<span id="row3Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im'$data$matches);
                
    preg_match_all('/<a id="rowTitle3\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im'$data$matches3);
                
    $time3 $matches[1];
                
    $titles3 $matches3[1];
                echo 
    "<span id='time3'>".$time3[1]."</span> - <span id='title3'>".$titles3[1]."</span><br></br>";
        
                
    preg_match_all('/<span id="row4Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im'$data$matches);
                
    preg_match_all('/<a id="rowTitle4\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im'$data$matches4);
                
    $time4 $matches[1];
                
    $titles4 $matches4[1];
                echo 
    "<span id='time4'>".$time4[1]."</span> - <span id='title4'>".$titles4[1]."</span><br></br>"
                
                
    preg_match_all('/<span id="row5Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im'$data$matches);
                
    preg_match_all('/<a id="rowTitle5\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im'$data$matches5);
                
    $time5 $matches[1];
                
    $titles5 $matches5[1];
                echo 
    "<span id='time5'>".$time5[1]."</span> - <span id='title5'>".$titles5[1]."</span><br></br>";
                
                
    preg_match_all('/<span id="row6Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im'$data$matches);
                
    preg_match_all('/<a id="rowTitle6\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im'$data$matches6);
                
    $time6 $matches[1];
                
    $titles6 $matches6[1];
                echo 
    "<span id='time6'>".$time6[1]."</span> - <span id='title6'>".$titles6[1]."</span><br></br>"
                
                
    preg_match_all('/<span id="row7Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im'$data$matches);
                
    preg_match_all('/<a id="rowTitle7\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im'$data$matches7);
                
    $time7 $matches[1];
                
    $titles7 $matches7[1];
                echo 
    "<span id='time7'>".$time7[1]."</span> - <span id='title7'>".$titles7[1]."</span><br></br>";
                
                
    preg_match_all('/<span id="row8Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im'$data$matches);
                
    preg_match_all('/<a id="rowTitle8\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im'$data$matches8);
                
    $time8 $matches[1];
                
    $titles8 $matches8[1];
                echo 
    "<span id='time8'>".$time8[1]."</span> - <span id='title8'>".$titles8[1]."</span><br></br>";
         }
              
    mysql_close($link);
            }
            else if(!
    $channels && ! $id
            {
              
    $qrytable1="SELECT id, channels, links FROM tvguide";
              
    $result1=mysql_query($qrytable1) or die('Error:<br />' $qry '<br />' mysql_error());
             
              while (
    $row mysql_fetch_array($result1)) 
              {
                echo 
    "<p id='channels'>";
                echo 
    $row['channels'];
                echo 
    "<p id='links'>";
                echo . 
    $row["channels"] . "&id=" $row["id"] .'</p>';
              }
            }
          }
        
    ?>
    Does anyone know how I can scrape the data with preg_match_all, or a similar method, so that the time and title values of every row are output without any of them going missing?

    I tried PHP DOM, but I have no idea how to select elements by their ids and classes.

    If you could post a PHP DOM example that uses the ids and classes, I would be very grateful.

    Any advice would be much appreciated.

    Thanks in advance
    Last edited by mark107; 09-08-2013 at 01:51 PM.

Thread Information

Users Browsing this Thread

There are currently 1 users browsing this thread. (0 members and 1 guests)

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •  
HTML5 Development Center



Recent Articles