www.webdeveloper.com
Results 1 to 4 of 4

Thread: webscrapper cURL ran out of memory

  1. #1
    Join Date
    Feb 2007
    Posts
    4

    webscrapper cURL ran out of memory

    Hi guys,

    Im trying to make a webscrapper and im having a huge problem when retrieving huge amount of data. I have tried to increase the memory through PHP.ini but its still doesnt solve the problem.

    The webscrapper I want to make is to retrieve data from journal database and put it into an excel file. While it is working with small datasets, it will run out of memory when retrieving large datasets.

    Here is the code :
    x
    PHP Code:

    function fetchRawData($url,$search,$currentpagenumber,$numpage,$numrecordtotalsofar) {
        if(
    $currentpagenumber<$numpage) {
            
    //initialise curl
            
    $url "http://ieeexplore.ieee.org/search/searchresult.jsp?queryText%3D".$search."&rowsPerPage=100&pageNumber=".$currentpagenumber."&resultAction=ROWS_PER_PAGE";
            echo 
    "<br>new url = ".$url."<br>";

            
    $ch curl_init();
            
    curl_setopt($ch,CURLOPT_URL,$url);
            
    curl_setopt($ch,CURLOPT_RETURNTRANSFER,true);
            
    curl_setopt($ch,CURLOPT_FAILONERROR,true);
            
    curl_setopt($ch,CURLOPT_FOLLOWLOCATION,true);
        
    //    curl_setopt($ch,CURLOPT_TIMEOUT,50000);
            
    curl_setopt($ch,156,500000000);
            
    curl_setopt($ch,155,500000000);
            
    curl_setopt($ch,CURLOPT_SSL_VERIFYPEER,false);
            
    curl_setopt($ch,CURLOPT_SSL_VERIFYHOST,false);
            
            
    $data=curl_exec($ch);
            if(!
    $data) {
                
    var_dump(curl_getinfo($ch));
                die();
            }

            
            

            
    //parsing data
            
    $parsedData = array();
            
    phpQuery::newDocumentHTML($data);
            
    $arrtitle = array();
            
            
    $posttitle=1;
            if(
    $currentpagenumber<2) {
                
    $numrecordsofar=1;
            } else
            {
                
    $numrecordsofar=$numrecordtotalsofar;
            }


            
    //get the title, author and year of publication
            
    foreach(pq("a") as $link) {
                
    $title pq($link)->text();
                if(
    $title) {
                    
    //use regular expression to get the relevant information
                    
    if (preg_match("*articleDetails.jsp*"pq($link)->attr('href'))&&$gettitle<1) {
                        if(!(
    preg_match("*View full abstract*"$title)))
                        {
                            
    $dummyvar=$numrecordsofar+$posttitle;
                            
    array_push($arrtitle,$title);
                            
    $countrecord++;
                            
    $gettitle=1;
                        }
                    }
                }
            }
            
            
    //get the number of data
            
    foreach(pq("span") as $link) {
                
    $title pq($link)->text();
                if(
    $title) {
                    if (
    preg_match("*display-status results-returned*"pq($link)->attr('class'))) {
                        
    $countnumberonly preg_replace("*Results returned*"""$title);
                        
    $totalpageint intval($countnumberonly);
                        
    //calculate how many pages needed and record the current page
                        
    $totalpageint intval($totalpageint 100)+2;
                    }
                }


            }

            
    //initialise write to excel
            
    $objPHPExcel = new PHPExcel();
            
    $objPHPExcel->getProperties()->setCreator("Maarten Balliauw")
                                         ->
    setLastModifiedBy("Maarten Balliauw")
                                         ->
    setTitle("PHPExcel Test Document")
                                         ->
    setSubject("PHPExcel Test Document")
                                         ->
    setDescription("Test document for PHPExcel, generated using PHP classes.")
                                         ->
    setKeywords("office PHPExcel php")
                                         ->
    setCategory("Test result file");
            
            
    // Set active sheet index to the first sheet, so Excel opens this as the first sheet
            
    $objPHPExcel PHPExcel_IOFactory::load("IEEE_Scrap.xlsx");
            
    $objPHPExcel->setActiveSheetIndex(0);
            
    $objPHPExcel->createSheet();
            
    $row $objPHPExcel->getActiveSheet()->getHighestRow()+1;

            
    //get data from arrays    
            
    for($j=0;$j<count($arrtitle);$j++) {
                if(isset(
    $arrtitle[$j])) {
                    
    $dummyvar=$numrecordsofar+$j;
                    
    $objPHPExcel->getActiveSheet()->SetCellValue('A'.$dummyvar,$arrtitle[$j]);
                } else {
                    
    $dummyvar=$numrecordsofar+$j;
                    
    $globalIEEE[$tempcount+$j][0]="No Data";
                    
    $objPHPExcel->getActiveSheet()->SetCellValue('A'.$dummyvar,"No Data");
                }
            }
            
    $objWriter = new PHPExcel_Writer_Excel2007($objPHPExcel);
            
    $objWriter->save('IEEE_Scrap.xlsx');
            
            
    //close curl and phpexcel
            
    curl_close($ch);
            unset(
    $ch);
            unset(
    $objPHPExcel);
            unset(
    $objWriter);
            
    $currentpagenumber++;
            
    $numrecordtotalsofar=$numrecordtotalsofar+$countrecord;
            
    set_time_limit(0);
            
    sleep(5);

            
    $rawHTML fetchRawData($url,$search,$currentpagenumber,$totalpageint,$numrecordtotalsofar);
            return 
    $data;
        }

    The logic is first I retrieve the data on a page then putting it into an array after parsing it then initalise phpexcel to write the data from the array into excel then unset cURL and phpexcel and then move on to next page.

    Sorry the code is a bit messy as I have tried so many modifications but still cant get it work. Please help me !

  2. #2
    Join Date
    Mar 2007
    Location
    localhost
    Posts
    2,511
    Have you tried to scrape the data and store as a file and then process the file on the server?

    Most web hosts only allow a set memory limit as well as time limit for script execution.
    Yes, I know I'm about as subtle as being hit by a bus..(\\.\ Aug08)
    Yep... I say it like I see it, even if it is like a baseball bat in the nutz... (\\.\ Aug08)
    I want to leave this world the same way I came into it, Screaming, Incontinent & No memory!
    I laughed that hard I burst my colostomy bag... (\\.\ May03)
    Life for some is like a car accident... Mine is like a motorway pile up...

    Problems with Vista? :: Getting Cryptic wid it. :: The 'C' word! :: Whois?

  3. #3
    Join Date
    Feb 2007
    Posts
    4
    Quote Originally Posted by \\.\ View Post
    Have you tried to scrape the data and store as a file and then process the file on the server?

    Most web hosts only allow a set memory limit as well as time limit for script execution.

    Hi and thanks for the reply. As you can see from the code, I "write" data from each page to an excel file before moving on to the next pages. The scrapper does work in a small datasets but when i try to feed around >40k data (only the title of the book) it rans out of memory. do you have suggestion on this please ? thank you

  4. #4
    Join Date
    Mar 2007
    Location
    localhost
    Posts
    2,511
    When I look at this code here
    PHP Code:
            $data=curl_exec($ch);
            if(!
    $data) {
                
    var_dump(curl_getinfo($ch));
                die();
            }

            
            

            
    //parsing data
            
    $parsedData = array();
            
    phpQuery::newDocumentHTML($data);
            
    $arrtitle = array(); 
    I see that you have dumped the cURL data in to a variable but you do not put it in to a physical file and you don't release the data from the cURL operation until the end of the script.

    What I am suggesting is to preserve the data in a temporary file or a database and parse the data in the file or database without having to run several operations at once in the use of additional variables.

    You have to remember that your file may have a file of 40K but the amount of memory being used for that file could be 20 or more MB. Take Google Chrome 809Kb ins size, it has 10 windows open, it is a very small binary yet it is eating up resources of 27.5MB

    File size does not equal the same size when in memory.
    Last edited by \\.\; 08-22-2013 at 01:53 PM.
    Yes, I know I'm about as subtle as being hit by a bus..(\\.\ Aug08)
    Yep... I say it like I see it, even if it is like a baseball bat in the nutz... (\\.\ Aug08)
    I want to leave this world the same way I came into it, Screaming, Incontinent & No memory!
    I laughed that hard I burst my colostomy bag... (\\.\ May03)
    Life for some is like a car accident... Mine is like a motorway pile up...

    Problems with Vista? :: Getting Cryptic wid it. :: The 'C' word! :: Whois?

Thread Information

Users Browsing this Thread

There are currently 1 users browsing this thread. (0 members and 1 guests)

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •  
HTML5 Development Center



Recent Articles