www.webdeveloper.com
Results 1 to 1 of 1

Thread: Missing data when scraping from website

  1. #1
    Join Date
    Sep 2008
    Posts
    62

    Missing data when scraping from website

    Hi guys,

    I have a problem scraping data from a third-party website. I'm currently using preg_match_all with a separate pattern for each time tag and each title tag to copy the values from the third-party site onto my own website, but some of the data is missing from my output.


    Here's what the HTML from the third-party site looks like:
    Code:
        <span id="row1Time" class="zc-ssl-pg-time">9:00 AM</span>
        <a id="rowTitle1" class="zc-ssl-pg-title">CBS News Sunday Morning</a>
        <span id="row2Time" class="zc-ssl-pg-time">10:30 AM</span>
        <a id="rowTitle2" class="zc-ssl-pg-title">Face the Nation</a>
        <span id="row3Time" class="zc-ssl-pg-time">11:30 AM</span>
        <span id="rowTitle3" class="zc-ssl-pg-title">Local Programming</span>
        <span id="row4Time" class="zc-ssl-pg-time">12:00 PM</span>
        <a id="rowTitle4" class="zc-ssl-pg-title">The NFL Today</a>
        <span id="row5Time" class="zc-ssl-pg-time">1:00 PM</span>
        <a id="rowTitle5" class="zc-ssl-pg-title">NFL Football</a>
        <span id="row6Time" class="zc-ssl-pg-time">4:30 PM</span>
        <a id="rowTitle6" class="zc-ssl-pg-title"'>2013 U.S. Open Tennis</a>
        <span id="row7Time" class="zc-ssl-pg-time">7:00 PM</span>
        <span id="rowTitle7" class="zc-ssl-pg-title">Local Programming</span>
        <span id="row8Time" class="zc-ssl-pg-time">7:30 PM</span>
        <a id="rowTitle8" class="zc-ssl-pg-title">CBS Evening News</a>

    Here is the HTML output data on my website:
    Code:
        <span id='time1'>9:00 AM</span> - <span id='title1'>CBS News Sunday Morning</span><br></br>
        <span id='time2'>10:30 AM</span> - <span id='title2'>Face the Nation</span><br></br>
        <span id='time3'></span> - <span id='title3'></span><br></br>
        <span id='time4'>12:00 PM</span> - <span id='title4'>The NFL Today</span><br></br>
        <span id='time5'>3:30 PM</span> - <span id='title5'>The Bold and the Beautiful</span><br></br>
        <span id='time6'>4:00 PM</span> - <span id='title6'>The Talk</span><br></br>
        <span id='time7'></span> - <span id='title7'></span><br></br>
        <span id='time8'>7:30 PM</span> - <span id='title8'>CBS Evening News</span><br></br>

    Here's the php code:
    PHP Code:
        <?php
          define
    ('DB_HOST''localhost');
          
    define('DB_USER''myusername');
          
    define('DB_PASSWORD''mypassword');
          
    define('DB_DATABASE''mydb');
              
          
    $errmsg_arr = array();
          
    $errflag false;
          
    $link mysql_connect(DB_HOSTDB_USERDB_PASSWORD);
          
          if(!
    $link
          {
            die(
    'Failed to connect to server: ' mysql_error());
          }
        
          
    $db mysql_select_db(DB_DATABASE);
          if(!
    $db
          {
            die(
    "Unable to select database");
          }
        
          function 
    clean($var)
          {
            return 
    mysql_real_escape_string(strip_tags($var));
          }
          
    $channels clean($_GET['channels']);
          
    $id clean($_GET['id']);
          
          if(
    $errflag
          {
            
    $_SESSION['ERRMSG_ARR'] = $errmsg_arr;
            echo 
    implode('<br />',$errmsg_arr);
          }
          else 
          {
            
    $insert = array();
            
            if(isset(
    $_GET['channels'])) 
            {
              
    $insert[] = 'channels = \'' clean($_GET['channels']) .'\'';
            }
            if(isset(
    $_GET['id'])) 
            {
              
    $insert[] = 'id = \'' clean($_GET['id']) . '\'';
            }
            
            
            if(
    $channels && $id
            {
              
    $qrytable1="SELECT id, channels, links FROM tvguide WHERE channels='$channels' && id='$id'";
              
    $result1=mysql_query($qrytable1) or die('Error:<br />' $qry '<br />' mysql_error());
                  
                
              while (
    $row mysql_fetch_array($result1)) 
              {
            
                
    $links $row['links'];
                
    $data file_get_contents($links);
                
    preg_match_all('/<span id="row1Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im'$data$matches);
                
    preg_match_all('/<a id="rowTitle1\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im'$data$matches1);
                
    $time1 $matches[1];
                
    $titles1 $matches1[1];
                echo 
    "<span id='time1'>".$time1[1]."</span> - <span id='title1'>".$titles1[1]."</span><br></br>";
        
                
    preg_match_all('/<span id="row2Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im'$data$matches);
                
    preg_match_all('/<a id="rowTitle2\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im'$data$matches2);
                
    $time2 $matches[1];
                
    $titles2 $matches2[1];
                echo 
    "<span id='time2'>".$time2[1]."</span> - <span id='title2'>".$titles2[1]."</span><br></br>";
        
                
    preg_match_all('/<span id="row3Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im'$data$matches);
                
    preg_match_all('/<a id="rowTitle3\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im'$data$matches3);
                
    $time3 $matches[1];
                
    $titles3 $matches3[1];
                echo 
    "<span id='time3'>".$time3[1]."</span> - <span id='title3'>".$titles3[1]."</span><br></br>";
        
                
    preg_match_all('/<span id="row4Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im'$data$matches);
                
    preg_match_all('/<a id="rowTitle4\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im'$data$matches4);
                
    $time4 $matches[1];
                
    $titles4 $matches4[1];
                echo 
    "<span id='time4'>".$time4[1]."</span> - <span id='title4'>".$titles4[1]."</span><br></br>"
                
                
    preg_match_all('/<span id="row5Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im'$data$matches);
                
    preg_match_all('/<a id="rowTitle5\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im'$data$matches5);
                
    $time5 $matches[1];
                
    $titles5 $matches5[1];
                echo 
    "<span id='time5'>".$time5[1]."</span> - <span id='title5'>".$titles5[1]."</span><br></br>";
                
                
    preg_match_all('/<span id="row6Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im'$data$matches);
                
    preg_match_all('/<a id="rowTitle6\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im'$data$matches6);
                
    $time6 $matches[1];
                
    $titles6 $matches6[1];
                echo 
    "<span id='time6'>".$time6[1]."</span> - <span id='title6'>".$titles6[1]."</span><br></br>"
                
                
    preg_match_all('/<span id="row7Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im'$data$matches);
                
    preg_match_all('/<a id="rowTitle7\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im'$data$matches7);
                
    $time7 $matches[1];
                
    $titles7 $matches7[1];
                echo 
    "<span id='time7'>".$time7[1]."</span> - <span id='title7'>".$titles7[1]."</span><br></br>";
                
                
    preg_match_all('/<span id="row8Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im'$data$matches);
                
    preg_match_all('/<a id="rowTitle8\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im'$data$matches8);
                
    $time8 $matches[1];
                
    $titles8 $matches8[1];
                echo 
    "<span id='time8'>".$time8[1]."</span> - <span id='title8'>".$titles8[1]."</span><br></br>";
         }
              
    mysql_close($link);
            }
            else if(!
    $channels && ! $id
            {
              
    $qrytable1="SELECT id, channels, links FROM tvguide";
              
    $result1=mysql_query($qrytable1) or die('Error:<br />' $qry '<br />' mysql_error());
             
              while (
    $row mysql_fetch_array($result1)) 
              {
                echo 
    "<p id='channels'>";
                echo 
    $row['channels'];
                echo 
    "<p id='links'>";
                echo . 
    $row["channels"] . "&id=" $row["id"] .'</p>';
              }
            }
          }
        
    ?>
    Does anyone know how I can scrape the data, using preg_match_all or a method similar to the one I currently use, so that both the time and the title values are captured and nothing is missing from the output?

    I tried PHP DOM, but I have no idea how to select elements by their ids and classes.

    If you could post an example using PHP DOM that selects elements by their ids and classes, I would be very grateful.

    Any advice would be much appreciated.

    Thanks in advance
    Last edited by mark107; 09-08-2013 at 12:51 PM.

Thread Information

Users Browsing this Thread

There are currently 1 users browsing this thread. (0 members and 1 guests)

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •  
HTML5 Development Center

"

"

X vBulletin 4.2.2 Debug Information

  • Page Generation 0.31448 seconds
  • Memory Usage 2,951KB
  • Queries Executed 13 (?)
More Information
Template Usage (33):
  • (1)SHOWTHREAD
  • (1)ad_footer_end
  • (1)ad_footer_start
  • (1)ad_global_above_footer
  • (1)ad_global_below_navbar
  • (1)ad_global_header1
  • (1)ad_global_header2
  • (1)ad_navbar_below
  • (1)ad_showthread_firstpost_sig
  • (1)ad_showthread_firstpost_start
  • (1)ad_thread_first_post_content
  • (2)bbcode_code
  • (1)bbcode_php
  • (1)footer
  • (1)forumjump
  • (1)forumrules
  • (1)gobutton
  • (1)header
  • (1)headinclude
  • (1)headinclude_bottom
  • (1)memberaction_dropdown
  • (1)navbar
  • (4)navbar_link
  • (1)navbar_moderation
  • (1)navbar_noticebit
  • (1)navbar_tabs
  • (2)option
  • (1)postbit
  • (1)postbit_onlinestatus
  • (1)postbit_wrapper
  • (1)spacer_close
  • (1)spacer_open
  • (1)tagbit_wrapper 

Phrase Groups Available (6):
  • global
  • inlinemod
  • postbit
  • posting
  • reputationlevel
  • showthread
Included Files (26):
  • ./showthread.php
  • ./global.php
  • ./includes/class_bootstrap.php
  • ./includes/init.php
  • ./includes/class_core.php
  • ./includes/config.php
  • ./includes/functions.php
  • ./includes/functions_navigation.php
  • ./includes/class_friendly_url.php
  • ./includes/class_hook.php
  • ./includes/class_bootstrap_framework.php
  • ./vb/vb.php
  • ./vb/phrase.php
  • ./includes/functions_facebook.php
  • ./includes/functions_calendar.php
  • ./includes/functions_bigthree.php
  • ./includes/class_postbit.php
  • ./includes/class_bbcode.php
  • ./includes/functions_reputation.php
  • ./includes/functions_notice.php
  • ./packages/vbattach/attach.php
  • ./vb/types.php
  • ./vb/cache.php
  • ./vb/cache/db.php
  • ./vb/cache/observer/db.php
  • ./vb/cache/observer.php 

Hooks Called (70):
  • init_startup
  • friendlyurl_resolve_class
  • init_startup_session_setup_start
  • database_pre_fetch_array
  • database_post_fetch_array
  • init_startup_session_setup_complete
  • global_bootstrap_init_start
  • global_bootstrap_init_complete
  • cache_permissions
  • fetch_threadinfo_query
  • fetch_threadinfo
  • fetch_foruminfo
  • load_show_variables
  • load_forum_show_variables
  • global_state_check
  • global_bootstrap_complete
  • global_start
  • style_fetch
  • global_setup_complete
  • showthread_start
  • showthread_getinfo
  • strip_bbcode
  • friendlyurl_clean_fragment
  • friendlyurl_geturl
  • forumjump
  • cache_templates
  • cache_templates_process
  • template_register_var
  • template_render_output
  • fetch_template_start
  • fetch_template_complete
  • parse_templates
  • fetch_musername
  • notices_check_start
  • notices_noticebit
  • process_templates_complete
  • friendlyurl_redirect_canonical
  • showthread_post_start
  • showthread_query_postids
  • showthread_query
  • bbcode_fetch_tags
  • bbcode_create
  • showthread_postbit_create
  • postbit_factory
  • postbit_display_start
  • postbit_imicons
  • bbcode_parse_start
  • bbcode_parse_complete_precache
  • bbcode_parse_complete
  • postbit_display_complete
  • memberaction_dropdown
  • tag_fetchbit_complete
  • forumrules
  • navbits
  • navbits_complete
  • build_navigation_data
  • build_navigation_array
  • check_navigation_permission
  • process_navigation_links_start
  • process_navigation_links_complete
  • set_navigation_menu_element
  • build_navigation_menudata
  • build_navigation_listdata
  • build_navigation_list
  • set_navigation_tab_main
  • set_navigation_tab_fallback
  • navigation_tab_complete
  • fb_like_button
  • showthread_complete
  • page_templates