Automatically scrape certain content on a website

I recommend installing a fresh Debian in a virtual machine as a purely text-based operating system to avoid any incompatibilities. Download a small amd64 image for installation here: https://www.debian.org/distrib/netinst Select “web server” and “SSH server” as software to install.
For virtualization I use VirtualBox: https://www.virtualbox.org/wiki/Downloads I always use the “bridged” network adapter mode, so the virtual machine is reachable from the host's network.
To access the Linux machine over SSH I use PuTTY.

Finally, I use “symfony/panther”, which drives a real browser, to scrape websites by following a scripted sequence of steps (click, wait, read).

Install the environment

Log in to Linux. Install “composer” (needed to install “symfony/panther”) and some missing packages:

su root
apt install composer php-xml php-curl php-zip chromium-driver
exit

Make a directory in your home folder or elsewhere for your script

mkdir scrap
cd scrap

Install “symfony/panther” and “symfony/css-selector” as user

composer req symfony/panther
composer req symfony/css-selector

You don’t need to install them as development dependencies (no “--dev” option).

Use of panther

Make a new file

nano scrap.php

Write your code. You can use my examples below. Execute the script on the command line as a normal user with

php scrap.php

Examples

Example: how to save a whole paginated table from a website

<?php

/*
 * Saves the '#results' table of the first result pages of a website.
 *
 * Steps: open the page, follow the "Search" link, switch the page length to 100,
 * then write the table of every page to table_<page>.txt in the current directory.
 * Run on the command line: php scrap.php
 */

use Symfony\Component\Panther\Client;
use Facebook\WebDriver\Exception\TimeoutException;

// load panther
require __DIR__ . '/vendor/autoload.php';

// number of result pages to save
$lastPage = 2;

// start browser and load website
$client = Client::createChromeClient();
$client->request( 'GET', 'https://testwebsite.org/test.html' );

// click link
$client->clickLink( 'Search' );

// wait for element; keep the returned crawler, it is needed to locate the select box below
$crawler = $client->waitFor( '#tabs' );

// select element 100 from list and click it
$input = $crawler->filterXPath( ".//select[@name='length']//option[@value='100']" );
$input->click();

// wait for visibility
$client->waitForVisibility( '#processing' );
try {
	$client->waitForInvisibility( '#processing' );
} catch ( TimeoutException $e ) {
	// Nothing: continue with whatever has been loaded so far
}

for ( $page = 1; $page <= $lastPage; $page ++ ) {

	// get full html code
	$html    = $client->getCrawler()->html();
	$crawler = new Symfony\Component\DomCrawler\Crawler( $html );

	// get the whole table
	$result = $crawler->filter( '#results' )->outerHtml();

	// write data to file
	$myfile = fopen( "table_" . $page . ".txt", "w" ) or die( "Unable to open file!" );
	fwrite( $myfile, $result );
	fclose( $myfile );

	// the last wanted page is saved: do not click "next" again, otherwise the
	// script would wait for a '#processing' indicator that may never show up
	if ( $page === $lastPage ) {
		break;
	}

	// click next page
	$client->executeScript( "document.querySelector('#next').click()" );

	// wait for element
	$client->waitForVisibility( '#processing' );
	try {
		$client->waitForInvisibility( '#processing' );
	} catch ( TimeoutException $e ) {
		// Nothing: continue with whatever has been loaded so far
	}
}

Here is an example of how to scrape the popup window that belongs to each entry of a paginated table

<?php

/*
 * Saves the popup ('.ui-dialog') that opens for every row of a paginated table.
 *
 * Steps: open the page, follow the "Search" link, switch the page length to 100,
 * then for each row click the image in the first column, write the popup's HTML
 * to item_<page>_<row>.txt and close the popup again.
 * Run on the command line: php scrap.php
 */

use Symfony\Component\Panther\Client;
use Facebook\WebDriver\Exception\TimeoutException;
use Facebook\WebDriver\Exception\NoSuchElementException;

// load panther
require __DIR__ . '/vendor/autoload.php';

// layout of the result list: 2 pages, 100 rows per page, only 4 rows on the last page
$lastPage       = 2;
$rowsPerPage    = 100;
$rowsOnLastPage = 4;
// how often a row is clicked before it is given up
$maxAttempts = 3;

// start browser and load website
$client = Client::createChromeClient();
$client->request( 'GET', 'https://testwebsite.org/test.html' );

// click link
$client->clickLink( 'Search' );

// wait for element
$crawler = $client->waitFor( '#tabssearch' );

// select element 100 from list and click it
$input = $crawler->filterXPath( ".//select[@name='length']//option[@value='100']" );
$input->click();

// wait for visibility
$client->waitForVisibility( '#processing' );

// wait for invisibility
try {
	$client->waitForInvisibility( '#processing' );
} catch ( TimeoutException $e ) {
	// Nothing: continue with whatever has been loaded so far
}

// take a screenshot
$client->takeScreenshot( 'screen.png' );

for ( $page = 1; $page <= $lastPage; $page ++ ) {
	$rows = ( $page === $lastPage ) ? $rowsOnLastPage : $rowsPerPage;

	for ( $row = 1; $row <= $rows; $row ++ ) {

		// click a image inside a table by row and col to open popup;
		// if it doesn't succeed repeat the click, but only a limited number of times
		$opened = false;
		for ( $attempt = 1; $attempt <= $maxAttempts && ! $opened; $attempt ++ ) {
			$client->executeScript( "document.querySelector('#results tbody tr:nth-of-type(" . $row . ") td:nth-of-type(1) img').click()" );
			try {
				$client->waitForVisibility( '#buttons' );
				$opened = true;
			} catch ( NoSuchElementException | TimeoutException $e ) {
				// popup is missing or still hidden: try again
			}
		}
		if ( ! $opened ) {
			// report the row instead of retrying forever
			fwrite( STDERR, "Unable to open popup on page " . $page . ", row " . $row . "\n" );
			continue;
		}

		// get complete html code
		$html    = $client->getCrawler()->html();
		$crawler = new Symfony\Component\DomCrawler\Crawler( $html );
		// get html code of the popup
		$result = $crawler->filter( '.ui-dialog' )->outerHtml();
		// Write data to file
		$myfile = fopen( "item_" . $page . "_" . $row . ".txt", "w" ) or die( "Unable to open file!" );
		fwrite( $myfile, $result );
		fclose( $myfile );

		// Close popup
		$client->executeScript( "document.querySelector('#buttons #close').click()" );
		try {
			$client->waitForInvisibility( '#buttons' );
		} catch ( TimeoutException $e ) {
			// Nothing: go on with the next row
		}
	}

	// the last page is done: do not click "next" again, otherwise the
	// script would wait for a '#processing' indicator that may never show up
	if ( $page === $lastPage ) {
		break;
	}

	// click next page
	$client->executeScript( "document.querySelector('#next').click()" );
	$client->waitForVisibility( '#processing' );
	try {
		$client->waitForInvisibility( '#processing' );
	} catch ( TimeoutException $e ) {
		// Nothing: continue with whatever has been loaded so far
	}
}

Example: how to extract the desired information from the saved files

<?php

/*
 * Reads the saved popup files item_<page>_<row>.txt and writes two CSV files:
 *   names.csv     - "id","name","country" (country holds the id from countries.csv, or NULL)
 *   countries.csv - "id","country"
 * The values are escaped with the MySQL connection so the files can be imported into MySQL.
 */

require __DIR__ . '/vendor/autoload.php';

// create a mysql connection (only used to escape the values)
$mysqli = mysqli_connect( "localhost", "test", "test", "new_database" );
if ( $mysqli === false ) {
	die( "Unable to connect to database: " . mysqli_connect_error() );
}
mysqli_set_charset( $mysqli, "utf8mb4" );

$csv_head = "\"id\",\"name\",\"country\"";
$file     = fopen( 'names.csv', 'w' ) or die( "Unable to open file!" );
fwrite( $file, $csv_head . "\n" );
$countries = [];
$id        = 1;
$match     = [];

for ( $page = 1; $page <= 2; $page ++ ) {
	for ( $row = 1; $row <= 100; $row ++ ) {
		// page 2 only has 4 rows
		if ( $page == 2 && $row > 4 ) {
			break;
		}

		// get file content
		$path = "./item_" . $page . "_" . $row . ".txt";
		$html = file_get_contents( $path );
		if ( empty( $html ) ) {
			die( "Unable to read file " . $path );
		}

		// start crawler
		$crawler = new Symfony\Component\DomCrawler\Crawler();
		$crawler->addHtmlContent( $html );

		// get text inside the first element of the table
		try {
			$country = $crawler->filterXPath( '//*[@class="header"]/tbody/tr[1]/td[1]' )->text();
			// remove unwanted spaces
			$country = trim( preg_replace( '/\s+/', ' ', $country ) );
			if ( empty( $country ) ) {
				continue;
			}
			// escape data for mysql import; quotes are doubled because the value is
			// written between double quotes in countries.csv
			$country = mysqli_real_escape_string( $mysqli, $country );
			$country = str_replace( "\\\"", "\"\"", $country );
			$country = str_replace( "\\'", "''", $country );
			// verify if country is already taken, otherwise append it
			$country_key = array_search( $country, $countries, true );
			if ( $country_key === false ) {
				$countries[] = $country;
				$country_key = count( $countries ) - 1;
			}
			// ids in countries.csv start at 1
			$country_key = ( $country_key + 1 );
		} catch ( \InvalidArgumentException $e ) {
			// data has no country: write NULL instead of reusing the id of the previous row
			$country_key = "NULL";
		}

		// get text inside the second element of the table
		try {
			$name = $crawler->filterXPath( '//*[@class="header"]/tbody/tr[2]/td[1]' )->text();
			$name = trim( preg_replace( '/\s+/', ' ', $name ) );
			if ( empty( $name ) ) {
				continue;
			}
			$name = "\"" . mysqli_real_escape_string( $mysqli, $name ) . "\"";
			$name = str_replace( "\\\"", "\"\"", $name );
			$name = str_replace( "\\'", "''", $name );
			// every name is written only once
			if ( in_array( $name, $match, true ) ) {
				continue;
			}
			$match[] = $name;
		} catch ( \InvalidArgumentException $e ) {
			// data has no name: skip it
			continue;
		}

		// Write data
		fwrite( $file, $id . "," . $name . "," . $country_key . "\n" );

		$id ++;

	}
}
fclose( $file );


$csv_head = "\"id\",\"country\"";
$file     = fopen( 'countries.csv', 'w' ) or die( "Unable to open file!" );
fwrite( $file, $csv_head . "\n" );
foreach ( $countries as $key => $type ) {
	fwrite( $file, ( $key + 1 ) . "," . "\"" . $type . "\"" . "\n" );
}
fclose( $file );


Problems

If you get an error like this in the /var/log/apache2/error.log file:

PHP Fatal error:  Uncaught RuntimeException: Could not start chrome. Exit code: 1 (General error). Error output: /system.slice/apache2.service is not a snap cgroup

You can’t start the script through the web server (i.e. by opening it in a browser). You have to execute your script on the command line, like this:

php script_name.php

If you get this error:

PHP Fatal error:  Uncaught Facebook\WebDriver\Exception\SessionNotCreatedException: session not created: Chrome failed to start: exited normally.
  (session not created: DevToolsActivePort file doesn't exist)
  (The process started from chrome location /usr/bin/chromium is no longer running, so ChromeDriver is assuming that Chrome has crashed.) 

Try to execute the script as a non-root user.