<?php
#	rip2db
#
#	A small language to process web pages, extract information and place it into a database.
#
#	Requirements	: Runs under UNIX with PHP and MySQL.
#	Version		: 1.2
#	Author		: Mike Robinson	
#	Date		: 2008-MAR-15
#	Usage		: php rip2db example_1
#	Web page	: www.bikesandkites.com/rip2db.html
#
# doc
#	tar cvzf rip2db.tgz rip2db
#	zip -r rip2db rip2db
#	SAVE url NAME str
#	have title variable with title of page
#	ignore case completely
#	LOOP_URL RECURSE
#	no queue stuff any more
#
# to do
#	IGNORE
#	GET uses stripos but should be regexp
#	sybase connection test
#	robots.txt not tested
#	errors should use die
#	control structure by indent
#	store file types not recognised + pattern that matched
#	mark global var
#
# bugs

#
#	these need expanding (a lot)
#
$image_postfixes = array( 'jpg', 'gif', 'png' );
$audio_postfixes = array( 'mp3', 'ram', 'ogg', 'wav' );
$video_postfixes = array( 'avi', 'mp4', 'mpg', 'wmv', 'mov' );


#
#	internal global state shared by the functions below
#

# file locations
$log_file  = './rip2db.log';
$saved_dir = './save';
$init_file = 'global';

# database connection details (filled in by the DB command)
$db_type   = '';
$db_server = '';
$db_db     = '';
$db_user   = '';
$db_pwd    = '';

# page cache and running byte count
$cached_fc  = '';
$total_size = 0;

# program text and the read position within it
$program = '';
$ptr     = 0;

# set by the IGNORE command
$ignore_flg = 0;

# list of urls seen by LOOP_URL RECURSE, starts empty
$recurse = array();
$word    = '';

###################################

#
#	Reset the built-in entries of the global variable table to their
#	start-of-program values.  Entries with other names are not touched.
#
function init_variables() {
	GLOBAL $glob_var;

	$defaults = array(
		"DELAY_BETWEEN_LOADS"	=> 10,
		"MAX_NUM_LOADS"		=> 300,
		"SEQ"			=> 0,
		"DEBUG"			=> 0,
		"TITLE"			=> "",
		"CURRENT_FILE"		=> "",
		'CURRENT_BASE'		=> "",
		"VERSION"		=> "rip2db v1.5",
		"NUM_LOADS"		=> 0,
	);

	# assign key by key so that anything else held in the table survives
	foreach( $defaults as $name => $value ) {
		$glob_var[ $name ] = $value;
	}
}

###################################

#
#	Print every entry of the global variable table, one "name = value"
#	line each, under a short heading.
#
function show_variables() {
	GLOBAL $glob_var;

	echo "Variables are: \n";

	$names = array_keys( $glob_var );

	for( $i = 0; $i < count( $names ); $i++ ) {
		$name = $names[ $i ];
		echo "   " . $name . " = " . $glob_var[ $name ] . "\n";
	}
}

###################################

#
#	Return one part of a url.
#
#	$section	'HOST' for the host name, 'DIR' for the host name followed
#			by everything after it up to (not including) the last '/'
#	$url		the url to take apart
#
#	Returns '' for an empty url or a url in which parse_url() finds no
#	host.  Any other $section is reported through log_msg() as an ERROR.
#
function url_section( $section, $url ) {

	if ( $url == '' ) { return ''; }

	# parse_url() returns false for badly formed urls and leaves 'host'
	# out when the url has no scheme
	$url_bits = parse_url( $url );
	$url_host = '';

	if ( is_array( $url_bits ) and isset( $url_bits['host'] ) ) {
		$url_host = $url_bits['host'];
	}

	if ( $section == 'HOST' ) {
		return $url_host;

	} elseif ( $section == 'DIR' ) {
		if ( $url_host == '' ) { return ''; }

		# drop everything up to and including the FIRST occurrence of the
		# host; the host is quoted so its dots are not treated as wildcards
		$url_path = preg_replace( '/^.*?' . preg_quote( $url_host, '/' ) . '/s', '', $url, 1 );

		# only a path with at least two slashes has a directory part
		if ( preg_match( '/\/.*\/.*/s', $url_path ) ) {
			$url_path = preg_replace( '/\/[^\/]*$/D', '', $url_path );

			return $url_host . $url_path;
		}

		return $url_host;

	} else {
		log_msg( "ERROR", "Can only return HOST or DIR" );
		return '';
	}
}

###################################

#
#	Advance the global program pointer past the END_$type word that
#	closes the block currently being skipped.
#
#	$type	'URL', 'SQL', 'TABLE', 'LOOP' or 'IF'
#
#	Blocks of the same type nested inside the skipped block are counted so
#	that their END words are not mistaken for the closing one.  The opening
#	word is LOOP_$type, except for LOOP and IF whose opening word is the
#	type itself.  Words are compared in upper case because commands are
#	upper-cased before they are interpreted, and the contents of comments
#	are passed over without being examined.
#
#	NOTE(review): words are read without regard to quoting, so a command
#	argument that is exactly an opening or closing word is still counted.
#
function find_end_tag( $type ) {
	GLOBAL $ptr;
	GLOBAL $program;

	if ( $type == 'LOOP' or $type == 'IF' ) {
		$open_word = $type;
	} else {
		$open_word = "LOOP_$type";
	}
	$close_word = "END_$type";

	# one block is already open when we are called
	$depth = 1;

	while ( $depth > 0 and $ptr < strlen( $program ) ) {
		$word = strtoupper( read_word( 1 ) );

		if ( $word == '/*' ) {
			# skip to the end of the comment
			while ( $word != '*/' and $ptr < strlen( $program ) ) {
				$word = read_word( 1 );
			}
		} else if ( $word == $open_word ) {
			$depth += 1;
		} else if ( $word == $close_word ) {
			$depth -= 1;
		}
	}
}

###################################

#
#	Format a byte count for display using decimal units.
#
#	$size	number of bytes
#
#	Returns the whole number of gb, mb, kb or b followed by the unit,
#	e.g. 1500 gives "1kb".  A unit is only used once the size exceeds it.
#
function show_size( $size ) {

	if ( $size > 1000000000 ) {
		# gigabytes are divided by 10^9 (not 10^6, which reports megabytes)
		return intval( $size / 1000000000 ) . "gb";
	}

	if ( $size > 1000000 ) {
		return intval( $size / 1000000 ) . "mb";
	}

	if ( $size > 1000 ) {
		return intval( $size / 1000 ) . "kb";
	}

	return intval( $size ) . "b";
}

###################################

#
#	Report a message according to its type.
#
#	$type	ERROR	written to the log file and the screen, then the
#			script stops with exit status 1
#		WARNING	written to the log file and the screen
#		LOG	written to the log file only
#		INFO1	written to the screen when DEBUG is on
#		INFO2	written to the screen when DEBUG is greater than 1
#	$msg	message text; the date and time are put in front of it
#
#	The log file is only written when the global handle $fh is open.
#
function log_msg( $type, $msg ) {
	GLOBAL $fh;
	GLOBAL $glob_var;

	$date_time = date( 'ymd H:i' );

	if ( $type == "ERROR" or $type == "WARNING" ) { $msg = "$type: $msg"; }
	$msg = "$date_time $msg\n";

	# DEBUG may not be set yet if we are called before init_variables()
	$debug = isset( $glob_var[ 'DEBUG' ] ) ? $glob_var[ 'DEBUG' ] : 0;

	# never write to a log handle that failed to open or is already closed
	$can_log = is_resource( $fh );

	if ( $type == 'ERROR' ) {

		if ( $can_log ) {
			fwrite( $fh, $msg );
			fclose( $fh );
		}

		echo "===============================================\n";
		echo $msg;
		echo "===============================================\n";

		if ( $debug ) { show_variables(); }

		# non-zero status so that calling scripts can see the failure
		exit( 1 );
	} else if ( $type == 'WARNING' ) {
		if ( $can_log ) { fwrite( $fh, $msg ); }
		echo "===============================================\n";
		echo $msg;
		echo "===============================================\n";
	} else if ( $type == 'LOG' ) {
		if ( $can_log ) { fwrite( $fh, $msg ); }
	} else if ( $type == 'INFO1' and $debug ) {
		echo $msg;
	} else if ( $type == 'INFO2' and $debug > 1 ) {
		echo $msg;
	}
}

###################################

#
#	Run a SQL statement on the database chosen by the DB command.
#
#	$sql	statement to run; control characters are replaced by spaces
#		and runs of blanks are collapsed before it is sent
#
#	If the first attempt fails the connection is re-opened using the
#	global connection details and the statement is tried once more; a
#	second failure is reported through log_msg() as an ERROR.
#
#	Returns the result of the query call.
#
#	NOTE(review): the mysql_* and sybase_* functions used here are not
#	available in PHP 7 and later.
#
function run_sql( $sql ) {
	GLOBAL $db_type;
	GLOBAL $db_server;
	GLOBAL $db_db;
	GLOBAL $db_user;
	GLOBAL $db_pwd;

	# remove bad char
	$sql = preg_replace( '/[\x01-\x1F]/', ' ', $sql );
	$sql = preg_replace( '/[ \t]+/', ' ', $sql );

	# run sql
	$retry = 0;
	$result = false;

	if ( $db_type == 'MYSQL' ) {
		$result = mysql_query( $sql ) or $retry = 1;
	} else if ( $db_type == 'SYBASE' ) {
		$result = sybase_query( $sql ) or $retry = 1;
	} else {
		log_msg( "ERROR", "DB can only connect to MYSQL or SYBASE" );
	}

	# have a second go if fails
	if ( $retry > 0 ) {
		sleep(5);

		# reconnect
		if ( $db_type == 'MYSQL' ) {
			mysql_pconnect( $db_server, $db_user, $db_pwd )
				or log_msg( "ERROR", "cannot connect to server $db_server" );

			mysql_select_db( $db_db )
				or log_msg( "ERROR", "cannot connect to database $db_db" );
		} else if ( $db_type == 'SYBASE' ) {
			sybase_connect( $db_server, $db_user, $db_pwd )
				or log_msg( "ERROR", "cannot connect to server $db_server" );

			sybase_select_db( $db_db )
				or log_msg( "ERROR", "cannot connect to database $db_db" );
		} else {
			log_msg( "ERROR", "DB can only connect to MYSQL or SYBASE" );
		}

		sleep(2);

		# retry sql
		if ( $db_type == 'MYSQL' ) {
			$result = mysql_query( $sql ) or log_msg( "ERROR", "SQL error\n$sql" );
		} else if ( $db_type == 'SYBASE' ) {
			$result = sybase_query( $sql ) or log_msg( "ERROR", "SQL error\n$sql" );
		} else {
			log_msg( "ERROR", "DB can only connect to MYSQL or SYBASE" );
		}
	}

	return $result;
}



###################################

function glue_url($parsed)
{
	# from multiple authors on http://uk.php.net/function.parse-url site
	#
	# Put the pieces produced by parse_url() back together into one url.
	# Returns false when $parsed is not an array.

    if (!is_array($parsed)) {
        return false;
    }

    $uri = '';

    # scheme, followed by '//' for everything except mailto
    if (isset($parsed['scheme'])) {
        $uri .= $parsed['scheme'] . ':';
        if (strtolower($parsed['scheme']) != 'mailto') {
            $uri .= '//';
        }
    }

    # user[:pass]@
    if (isset($parsed['user'])) {
        $uri .= $parsed['user'];
        if (isset($parsed['pass'])) {
            $uri .= ':' . $parsed['pass'];
        }
        $uri .= '@';
    }

    if (isset($parsed['host'])) {
        $uri .= $parsed['host'];
    }

    if (isset($parsed['port'])) {
        $uri .= ':' . $parsed['port'];
    }

    # the path always starts with a slash in the result
    if (isset($parsed['path'])) {
        if (substr($parsed['path'], 0, 1) != '/') {
            $uri .= '/';
        }
        $uri .= $parsed['path'];
    }

    if (isset($parsed['query'])) {
        $uri .= '?' . $parsed['query'];
    }

    if (isset($parsed['fragment'])) {
        $uri .= '#' . $parsed['fragment'];
    }

    return $uri;
}

###################################

function resolve_url($base, $url) {
	# from multiple authors on http://uk.php.net/function.parse-url site
	#
	# Turn $url into an absolute url, using $base as the page it was found on.
	#
	#	$base	url of the current page, or '' when there is none
	#	$url	absolute, protocol-relative ("//host/x"), root-relative
	#		("/x"), fragment ("#x") or relative url
	#
	# With an empty $base the url is returned as given, with "http://" put in
	# front when it contains no "://".

	if (!strlen($base)) {
		if ( strpos( $url, '://' ) === false ) {
			$url = "http://$url";
		}
		return $url;
	}
	// Step 2
	if (!strlen($url)) return $base;
	// Step 3
	if (preg_match('!^[a-z]+:!i', $url)) return $url;

	// parse_url() returns false for a badly formed base
	$base = parse_url($base);
	if (!is_array($base)) { $base = array(); }

	if ($url[0] == "#") {
		// Step 2 (fragment)
		$base['fragment'] = substr($url, 1);
		return glue_url($base);
	}
	unset($base['fragment']);
	unset($base['query']);
	if (substr($url, 0, 2) == "//") {
		// Step 4: the url already carries its own host, so only the
		// scheme is taken from the base
		$scheme = isset($base['scheme']) ? $base['scheme'] : 'http';
		return $scheme . ':' . $url;
	} else if ($url[0] == "/") {
		// Step 5
		$base['path'] = $url;
	} else {
		// Step 6
		if (!isset($base['path'])) { $base['path'] = ''; }

		$path = explode('/', $base['path']);
		$url_path = explode('/', $url);
		// Step 6a: drop file from base
		array_pop($path);
		// Step 6b, 6c, 6e: append url while removing "." and ".." from
		// the directory portion
		$end = array_pop($url_path);
		foreach ($url_path as $segment) {
			if ($segment == '.') {
				// skip
			} else if ($segment == '..' && $path && $path[count($path)-1] != '..') {
				array_pop($path);
			} else {
				$path[] = $segment;
			}
		}
		// Step 6d, 6f: remove "." and ".." from file portion
		if ($end == '.') {
			$path[] = '';
		} else if ($end == '..' && $path && $path[count($path)-1] != '..') {
			$path[count($path)-1] = '';
		} else {
			$path[] = $end;
		}
		// Step 6h
		$base['path'] = join('/', $path);
	}
	// Step 7
	return glue_url($base);
}

###################################

#
#	Read the next word of the global $program at the global position
#	$ptr and move $ptr past it.
#
#	A word is either a run of characters up to the next space, or, when it
#	starts with ' or ", everything up to the matching closing quote (the
#	quotes are not part of the word).  An unclosed quote is reported
#	through log_msg() as an ERROR.
#
#	$skip_replace_var	0 to replace {NAME} markers in the word, any
#				other value to return the word as written
#
#	Markers are matched without regard to case:
#		{PCT_DONE}	share of the pages tracked by LOOP_URL RECURSE that
#				have been loaded, as a percentage (0 when no page
#				is being tracked)
#		{SEQ}		the SEQ variable, increased by one beforehand
#		{DATE} {TIME}	today's date (ymd) and the time (H:i)
#		{name}		the value of the variable of that name
#	A marker left unreplaced is reported as a WARNING.
#
#	Returns the word, or '' at the end of the program.
#
function read_word( $skip_replace_var ) {
	GLOBAL $ptr;
	GLOBAL $program;
	GLOBAL $recurse;
	GLOBAL $glob_var;
	GLOBAL $image_postfixes;
	GLOBAL $audio_postfixes;
	GLOBAL $video_postfixes;

	$program_len = strlen( $program );

	# skip leading spaces
	while( $ptr < $program_len and $program[ $ptr ] == ' ' ) { $ptr ++; }

	if ( $ptr >= $program_len ) {
		return '';
	}

	if ( $program[ $ptr ] == "'" or $program[ $ptr ] == '"' ) {
		# quoted word: runs to the matching quote
		$end_char = $program[ $ptr ];

		$ptr ++;
		$start = $ptr;

		while( $ptr < $program_len and $program[ $ptr ] != $end_char ) { $ptr ++; }

		if ( $ptr >= $program_len ) { log_msg( 'ERROR', "Unclosed quote" ); }

		$word = substr( $program, $start, $ptr - $start );

		# step over the closing quote
		$ptr ++;
	} else {
		# plain word: runs to the next space
		$start = $ptr;

		while( $ptr < $program_len and $program[ $ptr ] != " " ) { $ptr ++; }

		$word = substr( $program, $start, $ptr - $start );
	}

	if ( $skip_replace_var == 0 ) {
		if ( stripos( $word, '{PCT_DONE}' ) !== false ) {
			$pct_done = 0;
			$pct_not_done = 0;

			# count the href pages under the current directory that are
			# not media files
			foreach( array_keys( $recurse ) as $key ) {
				$file_type = strtolower( preg_replace( '/^.*\./s', '', basename( $key ) ) );
				if (	$recurse[$key]["pattern"] == "href"
					and url_section( 'DIR', $key ) ==
						url_section( 'DIR', $glob_var[ "CURRENT_FILE" ] )
					and ! in_array( $file_type, $audio_postfixes )
					and ! in_array( $file_type, $video_postfixes )
					and ! in_array( $file_type, $image_postfixes ) ) {

					if ( $recurse[$key]["loaded"] == 1 ) { $pct_done ++; }
					if ( $recurse[$key]["loaded"] == 0 ) { $pct_not_done ++; }
				}
			}

			# nothing counted means nothing done, not a division by zero
			$pct_total = $pct_done + $pct_not_done;
			$pct = sprintf( "%3d", $pct_total > 0 ? ( $pct_done * 100 ) / $pct_total : 0 );
			$word = str_ireplace( '{PCT_DONE}', $pct . '%', $word );
		}
		if ( stripos( $word, '{SEQ}' ) !== false ) {
			$glob_var[ 'SEQ' ] ++;
			$word = str_ireplace( '{SEQ}', strval( $glob_var['SEQ'] ), $word );
		}
		if ( stripos( $word, '{DATE}' ) !== false ) {
			$word = str_ireplace( '{DATE}', date( 'ymd' ), $word );
		}
		if ( stripos( $word, '{TIME}' ) !== false ) {
			$word = str_ireplace( '{TIME}', date( 'H:i' ), $word );
		}

		# plain text replacement, so neither the variable name nor its
		# value is ever treated as part of a regular expression
		foreach( array_keys( $glob_var ) as $key ) {
			$word = str_ireplace( '{' . $key . '}', strval( $glob_var[$key] ), $word );
		}

		if ( preg_match( '/\{[A-Za-z0-9_]+\}/', $word ) ) {
			log_msg( 'WARNING', "Unknown variable $word" );
		}
	}

	return $word;
}

###################################

#
#	Fetch the next word of the program, with variables replaced, and
#	return it in upper case so that commands can be written in any case.
#
function get_next_command( ) {
	return strtoupper( read_word( 0 ) );
}

###################################

#
#	Store a value in the global variable table.
#
#	$var	variable name; nothing is stored when it is empty
#	$val	value to store
#	$file	url of the current page, used to resolve _URL values
#
#	The ending of the name (any case) selects how the value is cleaned:
#		_URL	leading blanks, '=' and '>' are removed, the value is cut at
#			the first blank, '>' or '<', and the result is resolved
#			against $file
#		_TXT	named entities and tags are removed, the remaining
#			entities decoded, runs of blanks collapsed and any
#			character other than letters, digits, blank and & # ; , . :
#			is dropped
#
#	Always returns ''.
#
function set_var( $var, $val, $file ) {
	GLOBAL $glob_var;

	if ( $var > '' ) {
		if ( preg_match( '/_URL$/iD', $var ) ) {
			log_msg( "INFO2", "Processing $var value $val as URL" );
			$val = preg_replace( '/^[ =>]+/', '', $val );
			# the s modifier lets . run across line ends
			$val = preg_replace( '/[ ><].*/s', '', $val );
			$val = resolve_url( $file, $val );
		}

		if ( preg_match( '/_TXT$/iD', $var ) ) {
			log_msg( "INFO2", "Processing $var value $val as TXT" );
			$val = preg_replace( '/&[a-z]+;/', '', $val );
			$val = strip_tags( $val );
			$val = html_entity_decode( $val, ENT_NOQUOTES );
			$val = preg_replace( '/  +/', ' ', $val );
			$val = trim( preg_replace( '/[^ &#;a-zA-Z0-9,.:]/', '', $val ) );
		}

		$glob_var[ $var ] = $val;
		log_msg( "INFO1", "  $var = $val" );
	}

	return '';
}

###################################

#
#	When $file is on a different host from the last one checked, read
#	that host's robots.txt and set DELAY_BETWEEN_LOADS from its
#	Crawl-delay line (10 seconds when there is none).
#
#	$file	url about to be loaded
#
#	The site checked is remembered in CURRENT_BASE so that robots.txt is
#	fetched once per change of host and not before every load.  When
#	robots.txt holds several Crawl-delay lines the last one is used.
#
function be_a_good_robot( $file ) {
	GLOBAL $glob_var;

	$file_host = url_section( 'HOST', $file );

	# nothing to check for an empty url or one without a host
	if ( $file_host == '' ) { return; }

	if ( url_section( 'HOST', $glob_var[ 'CURRENT_BASE' ] ) != $file_host ) {

		# robots.txt lives at the root of the site the file comes from
		$url_bits = parse_url( $file );
		$base_host = ( isset( $url_bits['scheme'] ) ? $url_bits['scheme'] : 'http' ) . "://$file_host";
		if ( isset( $url_bits['port'] ) ) { $base_host .= ':' . $url_bits['port']; }

		$str = @file_get_contents( "$base_host/robots.txt" );
		$delay = '';

		if ( $str !== false and preg_match_all( '/Crawl-delay: *([0-9]+)/i', $str, $found ) ) {
			$delay = $found[1][ count( $found[1] ) - 1 ];
		}

		if ( $delay > '' ) {
			log_msg( 'LOG', "Found robots.txt delay on $base_host of $delay" );
			$glob_var[ "DELAY_BETWEEN_LOADS" ] = intval( $delay );
		} else {
			$glob_var[ "DELAY_BETWEEN_LOADS" ] = 10;
		}

		$glob_var[ 'CURRENT_BASE' ] = $base_host;

		log_msg( "INFO1", "Delay between loads set to " . $glob_var[ "DELAY_BETWEEN_LOADS" ] );
	}
}

###################################

#
#	Fetch a page and return its text ready for the commands to work on.
#
#	$new_file	url to load; '' returns '' at once
#
#	The page is taken from the one-page cache when it is the page loaded
#	last, otherwise it is downloaded, cached and counted in NUM_LOADS and
#	the running total size.  After the sixth download the script sleeps
#	for DELAY_BETWEEN_LOADS seconds after each one, and going over
#	MAX_NUM_LOADS is reported through log_msg() as an ERROR.
#
#	In the returned text all quotes are removed and control characters
#	are replaced by spaces.  TITLE is set to the contents of the page's
#	<title> element ('' when there is none) and the ignore flag is cleared.
#
#	Returns the cleaned text, '' when the page could not be read.
#
function load_contents( $new_file ) {
	GLOBAL $glob_var;
	GLOBAL $cached_fc;
	GLOBAL $total_size;
	GLOBAL $ignore_flg;

	be_a_good_robot( $new_file );

	if ( $new_file == "" ) { return ''; }

	# is this file already cached
	if ( $new_file == $glob_var[ "CURRENT_FILE" ] ) {
		$str = $cached_fc;

		log_msg( 'LOG', "Loaded $new_file from cache (length " .
				show_size( strlen($str) ) . ")" );
	} else {
		$str = @file_get_contents( $new_file );

		# a failed download is treated as an empty page
		if ( $str === false ) { $str = ''; }

		$cached_fc = $str;
		$glob_var[ "CURRENT_FILE" ] = $new_file;
		$glob_var[ "NUM_LOADS" ] ++;

		log_msg( 'INFO1', "COMMAND: LOAD $new_file" );

		# sleep between loads
		if ( $glob_var[ "NUM_LOADS" ] > 6 ) {
			sleep( $glob_var[ "DELAY_BETWEEN_LOADS" ] );
		}

		# limit to number of downloads
		if ( 0 + $glob_var[ "NUM_LOADS" ] > 0 + $glob_var[ "MAX_NUM_LOADS" ] ) {
			log_msg( 'ERROR', "Limited to " .
				strval( $glob_var[ "MAX_NUM_LOADS" ] ) . " loads" );
		}

		$cnt = strlen($str);
		$total_size += $cnt;
		log_msg( 'LOG', "Loaded $new_file (length " . show_size( $cnt ) . ")" );
	}

	# clean text
	$str = str_replace( array( "'", '"' ), '', $str );
	$str = preg_replace( '/[\x01-\x1F]/', ' ', $str );

	# read title variable
	if ( preg_match( '/<title[^>]*>(.*?)<\/title/is', $str, $found ) ) {
		$glob_var[ "TITLE" ] = $found[1];
	} else {
		$glob_var[ "TITLE" ] = '';
	}
	$ignore_flg = 0;

	return $str;
}

###################################

#
#	Interpret the commands of the global $program, starting at the
#	global read position $ptr.
#
#	$file	url of the page being worked on; relative urls are resolved
#		against it
#	$fc	the text the commands work on (a page or a piece of one)
#
#	The looping commands call this function again for every item they
#	produce; each of those calls ends when it reaches the matching END_
#	command.  Always returns ''.
#
#	NOTE(review): this function still uses the ereg/eregi/spliti family,
#	which is not available in PHP 7 and later.  The patterns given to
#	LIKE, REPLACE and LOOP come from the program being run, so moving
#	them to preg_* needs care.
#
function process_cmds( $file, $fc ) {
	GLOBAL $glob_var;
	GLOBAL $ptr;
	GLOBAL $program;
	GLOBAL $db_type;
	GLOBAL $db_server;
	GLOBAL $db_db;
	GLOBAL $db_user;
	GLOBAL $db_pwd;
	GLOBAL $cached_fc;
	GLOBAL $recurse;
	GLOBAL $total_size;
	GLOBAL $image_postfixes;
	GLOBAL $audio_postfixes;
	GLOBAL $video_postfixes;
	GLOBAL $saved_dir;
	GLOBAL $ignore_flg;

	# get next program word
	$word = get_next_command();

	#
	#	loop through commands
	#
	while( $ptr < strlen( $program ) ) {

		#
		#	load the url into the contents variable
		#
		if ( $word == 'LOAD' ) {
			$file = resolve_url( $file, read_word( 0 ) );
			log_msg( 'INFO1', "COMMAND: LOAD $file" );

			$str = load_contents( $file );

			# if file contains data then recurse with data
			if ( $str > '' ) {
				process_cmds( $file, $str );
			} else {
				log_msg( "WARNING" , "Empty file $file" );
			}

			return '';

		#
		#	reload cached copy of last file loaded
		#
		} else if ( $word == 'RELOAD' ) {
			if ( $glob_var[ "CURRENT_FILE" ] > '' ) {
				$str = $cached_fc;

				log_msg( 'LOG', "Reloaded " . $glob_var[ "CURRENT_FILE" ] . " from cache" );
			} else {
				log_msg( "ERROR" , "No file to RELOAD" );
			}

		#
		#	turn debugging on or off
		#
		} else if ( $word == 'DEBUG' ) {
			log_msg( 'INFO1', "COMMAND: DEBUG" );

			if ( $glob_var[ "DEBUG" ] > 0 ) {
				$glob_var[ "DEBUG" ] = 0;
			} else {
				$glob_var[ "DEBUG" ] = 1;
				$glob_var[ "MAX_NUM_LOADS" ] = 20;
			}

		#
		#	turn debugging on more or off
		#
		} else if ( $word == 'DEBUG_FULL' ) {
			log_msg( 'INFO1', "COMMAND: DEBUG_FULL" );

			if ( $glob_var[ "DEBUG" ] > 0 ) {
				$glob_var[ "DEBUG" ] = 0;
			} else {
				$glob_var[ "DEBUG" ] = 2;
				$glob_var[ "MAX_NUM_LOADS" ] = 20;
			}

		#
		#	copy a url to a file in the save directory
		#
		} else if ( $word == 'SAVE' ) {

			$url = resolve_url( $file, read_word( 0 ) );

			# the optional NAME clause is read before the download is
			# tried, so that a failed download does not leave NAME behind
			# to be taken for a command
			$current_pointer = $ptr;
			$word = get_next_command();

			if ( $word == 'NAME' ) {
				$filename = $saved_dir . "/" . read_word( 0 );
			} else {
				$ptr = $current_pointer;
				$filename = $saved_dir . "/" . basename( $url ); 
			}

			$filename = eregi_replace( "['\"\(\)]","",$filename );
			$filename = eregi_replace( " ","_",$filename );

			$load_file = @fopen( $url, "rb" );

			if ( ! $load_file ) { 
				log_msg( 'WARNING', "Could not copy $url" ); 
			} else { 

				log_msg( 'INFO1', "COMMAND: SAVE $url to $filename" );
				log_msg( 'LOG', "saving $url to $filename" );

				$sfh = @fopen( "$filename", "wb" ); 

				if ( ! $sfh ) {
					log_msg( 'WARNING', "Could not write $filename" );
				} else {
					while( ! feof( $load_file ) ) { 
						$line = fread( $load_file, 1028 ); 

						# stop on a read error rather than loop for ever
						if ( $line === false ) { break; }

						fwrite( $sfh, $line ); 

						# count what was really read, the last chunk is short
						$total_size += strlen( $line );
					} 

					fclose( $sfh ); 
				}

				fclose( $load_file );
				$glob_var[ "NUM_LOADS" ] ++;
				sleep( $glob_var[ "DELAY_BETWEEN_LOADS" ] );
			} 


		#
		#	loop through SQL results
		#
		} else if ( $word == 'LOOP_SQL' ) {
			$sql = read_word( 0 );
			$current_pointer = $ptr;

			log_msg( 'INFO1', "COMMAND: LOOP_SQL $sql" );
			log_msg( "LOG", "SQL $sql" );

			$result = run_sql( $sql );

			# start from an empty list, rows of an earlier LOOP_SQL in
			# this call must not be looped over again
			$data = array();

			if ( $db_type == 'MYSQL' ) { 
				while( $row = mysql_fetch_assoc( $result ) ) {
					$data[] = $row;
				}
			} else if ( $db_type == 'SYBASE' ) {
				while( $row = sybase_fetch_assoc( $result ) ) {
					$data[] = $row;
				}
			} else {
				log_msg( "ERROR", "DB can only connect to MYSQL or SYBASE" ); 
			}


			foreach( array_keys( $data ) as $i ) {
				# list the column names once, with the first row
				if ( $i == 0 ) {
					foreach( array_keys( $data[$i] ) as $col ) {
						log_msg( 'INFO1', "   ==> SQL column $col" );
					}
				}
				foreach( array_keys( $data[$i] ) as $col ) {
					$glob_var[$col] = $data[$i][$col];
				}
				process_cmds( $file, 'LOOP_SQL' );

				$ptr = $current_pointer;
			}

			find_end_tag( 'SQL' );
	
	
		#
		#	end sql loop
		#
		} else if ( $word == 'END_SQL' ) {
			log_msg( 'INFO1', "COMMAND: END_SQL" );
			return '';


		#
		#	loop through table entries
		#
		} else if ( $word == 'LOOP_TABLE' ) {

			$word = read_word( 0 );
			log_msg( 'INFO1', "COMMAND: LOOP_TABLE $word" );

			$tables = spliti( '<table', $fc );

			if ( count( $tables ) == 0 ) { log_msg( 'ERROR', "No tables to extract" ); }

			if ($word == '0' or $word == '-1') {
				# split url contents into sets of tables
				$max_cnt = -1;
				$tab = 0;

				# loop through each
				for( $i=0;$i<count($tables);$i++ ) {
					$tmp_cnt = count( spliti( '< *TR',$tables[$i] ) );
					log_msg( 'INFO1', "table $i has $tmp_cnt rows" );

					# and find the one with most entries
					if ( $tmp_cnt > $max_cnt ) {
						$max_cnt = $tmp_cnt;
						$tab = $i;
					}
				}

				log_msg( 'INFO1', "chosen table $tab" );
			} else {
				$tab = $word;
			}

			# get that table into fc
			$fc = "<" . $tables[$tab];
			$fc = eregi_replace( ".*</TH *>","",$fc );
			$fc = eregi_replace( "</TABLE.*","",$fc );
			$fc = eregi_replace( "</TR>","",$fc );

			$current_pointer = $ptr;
			$started = 0;

			foreach( spliti( '< *tr[^>]*>' ,$fc ) as $rec ) {
				log_msg( 'INFO2', "table record=$rec" );

				# clear column variables
				foreach( array_keys( $glob_var ) as $key ) {
					if ( ereg( '^COL_', $key ) ) { $glob_var[$key] = ""; }
				}

				# the text before the first row tag is not a row
				if ( $started ) {
					$c = 0;
					$col_started = 0;

					# get columns into COL_1 variables
					foreach( spliti( '< *td[^\>]*\>' ,$rec ) as $col ) {
						if ( $c > 0 ) {
							$glob_var[ "COL_$c" ] = strip_tags( $col );
						}
						$c ++;
					}

					if ( $c > 0 ) {
						process_cmds( $file, 'LOOP_TABLE' );
					}

					$ptr = $current_pointer;
				} else {
					$started = 1;
				}
			}

			# find matching END_TABLE
			find_end_tag( 'TABLE' );


		#
		#	end table
		#
		} else if ( $word == 'END_TABLE' ) {
			log_msg( 'INFO1', "COMMAND: END_TABLE" );
			return '';

		#
		#	split the remaining file contents by the string
		#	and do following commands with each section
		#
		} else if ( $word == 'LOOP' ) {
			$word = read_word( 0 );
			$cnt = count( spliti( $word ,$fc ) );
			log_msg( 'INFO1', "COMMAND: LOOP $word  --> split into $cnt sections" );

			$current_pointer = $ptr;
			$started = 0;

			foreach( spliti( $word ,$fc ) as $part ) {
				# the text before the first match is skipped
				if ( $started ) {
					log_msg( "INFO2", "  contents " . substr($fc,0,50) . "..." );
					process_cmds( $file, $part );
				} else {
					$started = 1;
				}

				$ptr = $current_pointer;
			}

			# find matching END_LOOP
			find_end_tag( 'LOOP' );

		#
		#	END_LOOP
		#
		} else if ( $word == 'END_LOOP' ) {
			log_msg( 'INFO1', "COMMAND: END_LOOP" );
			return '';

		#
		#	LOOP_URL
		#
		} else if ( $word == 'LOOP_URL' ) {

			$current_pointer = $ptr;
			$word = get_next_command();

			if ( $word == 'RECURSE' ) {
				$var1 = read_word( 0 );
				$var2 = read_word( 0 );

				$glob_var[ $var1 ] = "";
				$glob_var[ $var2 ] = "";
				$current_pointer = $ptr;

				# check recurse not already in action
				if ( count( $recurse ) > 1 ) {
					log_msg( 'ERROR', "LOOP_URL RECURSE cannot be nested" );
				}

				# init with current page
				$recurse = array( $file => array( "pattern"=>"href", "loaded"=>1 ) );

				# while pages left to read
				while ( $fc > '' ) {

					# holds url, pattern, loaded_flg
					# look to add all local, underdir, href links not already in list
					# maintain pct_done
					# ignore command
					foreach(array(	'[ 	]*href[ ="\']',
							'[	]*src[ ="\']*',
							'[ 	]*url[ ="\']*' ) as $pattern ) {

						$matches = spliti( $pattern, $fc );

						array_shift( $matches );

						# get rid of matches that are already stored

						foreach( $matches as $part_fc ) {

							# keep url
							$url = ereg_replace( '[ "\'\>\#\?].*', '', $part_fc );
							$url = resolve_url($file, $url);

							# if not already in our recurse array
							if ( ! in_array( $url, array_keys( $recurse ) ) ) {
								# if img then look for alt
								if ( ereg( 'img', $pattern ) ) {
									$str = ereg_replace( '\>.*', '', $part_fc );
									$str = eregi_replace( '^.*alt[ ="\']*', '', $str );
									$str = ereg_replace( '["\'].*', '', $str );
								} else if ( ereg( 'href', $pattern ) ) {
									$str = ereg_replace( '^[^\>]*\>', '', $part_fc );
									$str = eregi_replace( '\< *\/ *a *\>.*', '', $str );
								} else {
									$str = "";
								}

								# strip tags
								$str = trim( strip_tags( $str ) );

								log_msg( 'INFO1', "COMMAND: LOOP_URL looping on $url" );
								set_var( $var1, $url, $file );
								set_var( $var2, $str, $file );

								process_cmds( $file, 'LOOP_URL' );

								if ( ! $ignore_flg ) {
									# add element to array
									$recurse[$url] = array( 
										"pattern"=>eregi_replace( '[^a-z]','',
												$pattern), 
										"loaded"=>0 );
								}

							}

							$ptr = $current_pointer;
						}
					}

					$fc = "";

					# find another url to search through
					# not exactly recursive then :)
					foreach( array_keys( $recurse ) as $url ) {
						# get file type
						$file_type = strtolower( ereg_replace( '^.*\.', '', basename( $url ) ) );

						if (	$recurse[$url]["loaded"] == 0
							&& $recurse[$url]["pattern"] == 'href'
							&& url_section( 'DIR', $url ) == 
								url_section( 'DIR', $glob_var[ "CURRENT_FILE" ] )
							&& ! in_array( $file_type, $video_postfixes )
							&& ! in_array( $file_type, $image_postfixes )
							&& ! in_array( $file_type, $audio_postfixes ) ) {

							$recurse[$url]["loaded"] = 1;
							$fc = load_contents( $url );

							break;
						}
					}
				}

				# find matching END_URL
				find_end_tag( 'URL' );
			} else {
				$ptr = $current_pointer;

				$var1 = read_word( 0 );
				$var2 = read_word( 0 );

				$glob_var[ $var1 ] = "";
				$glob_var[ $var2 ] = "";
				$current_pointer = $ptr;

				foreach(array(	'[ 	]*href[ ="\']',
						'[	]*src[ ="\']*',
						'[ 	]*url[ ="\']*' ) as $pattern ) {

					$matches = spliti( $pattern, $fc );

					array_shift( $matches );

					foreach( $matches as $part_fc ) {

						# keep url
						$url = ereg_replace( '[ "\'\>].*', '', $part_fc );
						$url = resolve_url($file, $url);

						# if img then look for alt
						if ( ereg( 'img', $pattern ) ) {
							$str = ereg_replace( '\>.*', '', $part_fc );
							$str = eregi_replace( '^.*alt[ ="\']*', '', $str );
							$str = ereg_replace( '["\'].*', '', $str );
						} else if ( ereg( 'href', $pattern ) ) {
							$str = ereg_replace( '^[^\>]*\>', '', $part_fc );
							$str = eregi_replace( '\< *\/ *a *\>.*', '', $str );
						} else {
							$str = "";
						}

						# strip tags
						$str = trim( strip_tags( $str ) );

						log_msg( 'INFO1', "COMMAND: LOOP_URL looping on $url" );
						set_var( $var1, $url, $file );
						set_var( $var2, $str, $file );

						process_cmds( $file, 'LOOP_URL' );

						$ptr = $current_pointer;
					}
				}

				# find matching END_URL
				find_end_tag( 'URL' );
			}

		#
		#	END_URL
		#
		} else if ( $word == 'END_URL' ) {
			log_msg( 'INFO1', "COMMAND: END_URL" );
			return '';

		#
		#	IGNORE
		#
		} else if ( $word == 'IGNORE' ) {
			log_msg( 'INFO1', "COMMAND: IGNORE" );
			$ignore_flg = 1;

			return '';

		#
		#	IF
		#
		} else if ( $word == 'IF' ) {

			$overall_flg = true;
			$stage_flg = false;
			$last_flg = false;
			$not = "";
			$pat = "";


			do {
				$val = read_word( 0 );
				$word = get_next_command();

				if ( $word == 'NOT' ) {
					$action = true;
					$word = get_next_command();
					$not = "NOT";
				} else {
					$action = false;
					$not = "";
				}

				if ( $word == 'LIKE' ) { 
					$pat = read_word( 0 );
					$last_flg = ( $action xor eregi( $pat, $val ) );
				} else if ( $word == 'IS_AUDIO' ) { 
					$val = strtolower( ereg_replace( '^.*\.', '', basename( $val ) ) );
					$last_flg = ( $action xor in_array( $val, $audio_postfixes ) );

				} else if ( $word == 'IS_VIDEO' ) { 
					$val = strtolower( ereg_replace( '^.*\.', '', basename( $val ) ) );
					$last_flg = ( $action xor in_array( $val, $video_postfixes ) );

				} else if ( $word == 'IS_IMAGE' ) { 
					$val = strtolower( ereg_replace( '^.*\.', '', basename( $val ) ) );
					$last_flg = ( $action xor in_array( $val, $image_postfixes ) );

				} else if ( $word == 'IS_UNDER_DIR' ) { 
					$last_flg = (	$action 
							xor 
							url_section( 'DIR', $val ) == url_section( 'DIR', $file ) );

				} else if ( $word == 'IS_LOCAL' ) { 
					$last_flg = (	$action 
							xor 
							url_section( 'HOST', $val ) == url_section( 'HOST', $file ) );
				} else {
					log_msg( 'ERROR', "IF str [NOT] (LIKE str|IS_IMAGE|IS_AUDIO|IS_VIDEO|" .
							"IS_LOCAL|IS_UNDER_DIR) but not $word" );
				}

				log_msg( 'INFO1', "COMMAND: IF $val $not $word $pat" );
				if ( $overall_flg ) { $overall_flg = $last_flg; }

				$current_pointer = $ptr;
				$word = get_next_command();

				if ( $word == 'OR' ) {
					if ( $overall_flg ) { $stage_flg = true; }
					$overall_flg = true;
					$last_flg =  false;
				}
			} while( $word == 'AND' or $word == 'OR' );
				
			# go back to the word that followed the last test
			$ptr = $current_pointer;

			# if failed
			if ( ! ( $overall_flg and $last_flg ) and ! $stage_flg ) {
				find_end_tag( 'IF' );
			}

		#
		#	END_IF
		#
		} else if ( $word == 'END_IF' ) {
			# nothing to do
			$ptr = $ptr;

		#
		#	GET
		#
		} else if ( $word == 'GET' ) {
			$str = read_word( 1 );
			$var = '';

			log_msg( 'INFO1', "COMMAND: GET $str" );
			log_msg( "INFO2", "  from " . substr($fc,0,50) . "..." );

			# loop through str 'pat{var}pat{var}pat'
			do {
				#	get next pattern
				if ( ereg( '{', $str ) ) {
					$pat = substr( $str, 0, stripos( $str, '{' ) );
					if ( $pat == '' ) { $pat = '$'; }
					$str = substr( $str, stripos( $str, '{' ) +1 );

					# if var already declared then set it
					if ( $var > '' ) {
						# change to regexp
						$val = substr( $fc, 0, stripos( $fc, $pat ) );
						set_var( $var, $val, $file );
					}

					# get var
					$var = substr( $str, 0, stripos( $str, '}' ) );
					$str = substr( $str, stripos( $str, '}' ) +1 );

					if ( $pat > '' ) {
						# remove starting pattern from fc
						# change to regexp
						$fc = substr( $fc, stripos( $fc, $pat ) + strlen( $pat ) );
					}
				} else {
					$pat = $str;
					if ( $pat == '' ) { $pat = '$'; }

					if ( $var > '' ) {
						# change to regexp
						$val = substr( $fc, 0, stripos( $fc, $pat ) );
						set_var( $var, $val, $file );
					}

					$str = '';
					$var = '';

					# remove starting pattern from fc
					# change to regexp
					$fc = substr( $fc, stripos( $fc, $pat ) );
				}
			} while ( $str > '' );

			if ( $var > '' ) {
				set_var( $var, $fc, $file );
			}

		#
		#	comment
		#
		} else if ( $word == '/*' ) {
			while ( $word != '*/' and $ptr < strlen( $program ) ) {
				# we don't want to expand var names etc here
				$word = read_word( 1 );
			}

			if ( $ptr >= strlen( $program ) ) { log_msg( 'ERROR', "Unclosed comment" ); }
			log_msg( 'INFO1', "COMMAND: /* comment */" );

		#
		#	set variable to next string
		#
		} else if ( $word == 'SET' ) {
			$var = read_word( 0 );
			$val = read_word( 0 );
			set_var( $var, $val, $file );

			log_msg( 'INFO1', "COMMAND: SET $var $val" );

		#
		#	replace every occurrence of from with to in variable
		#
		} else if ( $word == 'REPLACE' ) {
			$var = read_word( 0 );
			$from = read_word( 0 );
			$to = read_word( 0 );

			log_msg( 'INFO1', "COMMAND: REPLACE $var $from $to" );

			# the variable is looked up by name, not among the values
			if ( ! array_key_exists( $var, $glob_var ) ) { 
				log_msg( 'ERROR', "Could not find variable $var" ); 
			}

			$val = eregi_replace( $from, $to, $glob_var[ $var ] );
			set_var( $var, $val, $file );

		#
		#	connect to database
		#
		} else if ( $word == 'DB' ) {
			$db_type = read_word( 0 );
			$db_server = read_word( 0 );
			$db_db = read_word( 0 );
			$db_user = read_word( 0 );
			$db_pwd = read_word( 0 );

			log_msg( 'INFO1', "COMMAND: DB connecting to $db_type database" );

			if ( $db_type == 'MYSQL' ) { 
				mysql_pconnect( $db_server, $db_user, $db_pwd ) 
					or log_msg( "ERROR", "$db_type cannot connect to server $db_server" );

				mysql_select_db( $db_db ) 
					or log_msg( "ERROR", "$db_type cannot connect to database $db_db" );
			} else if ( $db_type == 'SYBASE' ) {
				sybase_connect( $db_server, $db_user, $db_pwd ) 
					or log_msg( "ERROR", "$db_type cannot connect to server $db_server" );

				sybase_select_db( $db_db ) 
					or log_msg( "ERROR", "$db_type cannot connect to database $db_db" );
			} else {
				log_msg( "ERROR", "DB can only connect to MYSQL or SYBASE" ); 
			}


		#
		#	run the next string as sql into the database
		#
		} else if ( $word == 'SQL' ) {
			$sql = read_word( 0 );
			log_msg( 'INFO1', "COMMAND: SQL $sql" );

			$result = run_sql( $sql );
			log_msg( "LOG", "SQL $sql" );

		#
		#	write the next string to output
		#
		} else if ( $word == 'WRITE' ) {
			$word = read_word( 0 );
			log_msg( 'INFO1', "COMMAND: WRITE" );
			echo "$word\n";

		#
		#	else unknown command
		#
		} else {
			log_msg( 'ERROR', "Unknown command $word" );
		}


		#
		# read next command
		#
		$word = get_next_command();
	}

	return '';
}

###################################

#
#	read in program and clean string
#
#
#	main: run every program file named on the command line, in order.
#	The init file, when there is one, is put in front of each program.
#
$fh = fopen( $log_file, 'a' );

if ( ! $fh ) {
	echo "ERROR: Cannot open log file $log_file\n";
	exit( 1 );
}

foreach ( $argv as $key => $program_file ) { 
	# $argv[0] is the name of this script
	if ( $key > 0 ) {
		# start every program with a clean slate
		$program = "";
		$ptr = 0;
		$total_size = 0;
		$cached_fc = "";
		$recurse = array();
		$time_started = time();
		init_variables();

		#
		#	read init (it is optional)
		#
		$fp = @fopen( $init_file, 'r' );
		if ( $fp ) {
			while( ( $line = fgets( $fp ) ) !== false ) { $program .= $line; }
			fclose( $fp );
		}
		$program .= " ";

		#
		#	read program
		#
		$fp = @fopen( $program_file, 'r') or log_msg( "ERROR", "Cannot find program $program_file" );
		while( ( $line = fgets( $fp ) ) !== false ) { $program .= $line; }
		fclose( $fp );

		log_msg( 'LOG', "===================================" );
		log_msg( 'LOG', "= Running program $program_file" );
		log_msg( 'LOG', "===================================" );

		# line ends and other control characters become spaces, which is
		# all that read_word() treats as a separator
		$program = preg_replace( '/[\x01-\x1F]/', ' ', $program ) . '    ';

		#
		#	do commands
		#
		process_cmds( '', 'Start' );

		#
		#	say that finished
		#
		$msg = "Finished $program_file, loaded " . $glob_var["NUM_LOADS"] . 
			" files with total size " . show_size( $total_size ) . " in " .
			strval( time() - $time_started ) . " secs";

		if ( $glob_var[ 'DEBUG' ] ) { show_variables(); }

		log_msg( 'LOG', $msg );
		echo "$msg\n";
	}
}

fclose( $fh );

exit;
?>
