#!/usr/bin/perl
use strict;
use warnings;
use diagnostics;
use File::Basename;
use DBI;
use LWP::UserAgent;


my $dbh=DBI->connect('DBI:mysql:nine:localhost',
		     'e3', '', { RaiseError => 1 }) #turn back on later
  or die "Cannot connect: " . $DBI::errstr;

my $saved=0;
my $skipped=0;
my $inserted=0;
my $quiet=0;
my $table='nine';
my $MAX_CONTENT=5000000;
my $MAX_TITLE_DISPLAY=90;
my $MAX_TITLE=600;
my $MAX_URL_LEN=9000;

count($table) if table_exists($table);

binmode STDOUT, ':utf8';

my @urls=@ARGV;

if (!@urls) {
  if (-t STDIN ) {
    @urls=<STDIN>;
  }
  else {
    @urls=<>;
  }
}

if (!@urls) {
  *ARGV=*DATA;
}

foreach my $url (@urls) {
  chomp $url;
  next if $url =~ /#######/;

  if ($url =~ /^-q$/) {
    $quiet=1;
    next;
  }

  $saved=0;
  if ($url =~ /pdf$/i) {
    $saved=getPDF($url);
  }

  last if $url =~ /__END__/;    #Perl should do this.

  $url=parseUrl($url);

  my $stmt=sprintf "SELECT id,size FROM %s WHERE url='%s'", $table, $url;
  my($id, $size)=reap($stmt);

  my $inDb=0;
  if($id) {
    $inDb=1;
    printf "%s: %s %s\n", $id, $size, $url;
  } else {
    $inDb=insert($url);
  }
  my $stash=saveUrl($url,$inDb);
  printf "Stashed uri in %s\n", $stash if -f $stash;
}

count($table);

printf "%ss: %s urls skipped, %s inserted.\n", (time-$^T), $skipped, $inserted;
exit 0;


sub parseUrl {
  my($uri, @rest)=@_;

  if (length($uri) > $MAX_URL_LEN) {
    printf "url too long: %s bytes\n", length($uri);
    return 0;
  }

  return 1 if ($uri =~ /^-q$/);	#shh

  $uri =~ s/^'//;	       #In case of embedded # on command line.
  $uri =~ s/'$//;
  $uri =~ s/^\s+//;		#Strip leading spaces
  $uri =~ s/\s+$//;		#Strip trailing spaces
  return $uri;
}

sub table_exists {
  my($table, $loud)=@_;
  my $stmt=sprintf "SHOW TABLES like \'%%%s%%\'", $table;

  my @tables=reap($stmt);
  my $t;
  foreach $t (@tables) {
    if ($t =~ /$table/) {
      printf "Found table %s\n", $t if $loud;
      return $t;
    }
  }
  return init($t);
}


sub init {
  my $newsize=$_;
  $MAX_CONTENT=$newsize if ($newsize);

  drop_table($table);

  #Put content in its own table with id as key:
  my $stmt=sprintf "CREATE TABLE %s ( id MEDIUMINT NOT NULL AUTO_INCREMENT,
                          url VARCHAR(512) UNIQUE NOT NULL,
                          title VARCHAR(200),
                          content VARCHAR(%d),
                          size MEDIUMINT,
                          PRIMARY KEY (id)
    )", $table, $MAX_CONTENT;

  my $result=sow($stmt);
  return $result or 1;
}


sub sow {
  my($statement, $loud)=@_;
  printf "sowing %s\n", $statement if $loud;

  my $sthandle=$dbh->prepare($statement) or die $dbh->errstr;
  my $result=$sthandle->execute();

  my $e=$dbh->errstr;
  swing_into_action($e) if length($e);
  return $result;
}


sub reap {
  my($statement, $loud)=@_;
  printf "reaping: %s\n", $statement if $loud;

  my $sth=$dbh->prepare($statement); # prepare the query
  $sth->execute();		     # execute the query

  my @vals=$dbh->selectrow_array($statement);

  my $e=$dbh->errstr;
  swing_into_action($e) if ($e);

  return @vals;
}


sub harvest {
  my($statement, $loud)=@_;
  #printf "reaping: %s\n",$statement if $loud;

  my $data_ref=$dbh->selectall_arrayref($statement, { 'Columns' => {} });
  if (!$quiet) {
    for my $row (@$data_ref) {
      print "$_ => $row->{$_}\n" for keys %$row;
    }
  }

  my $e=$dbh->errstr;
  swing_into_action($e) if length($e);
  return $data_ref;
}


sub swing_into_action {
  my $err=$_;
  if ($err) {
    die $err if ($err =~ /Specified key was too long/);
    warn $err;

    if (($err =~ /Table.*doesn't exist/)
	or ($err =~ /Unknown table/)
	or ($err =~ /Unknown column 'file'/)) {
      init($table);
    }
    die $err;
  }
}


sub count {
  my($table, $loud)=@_;
  my $stmt=sprintf 'SELECT COUNT(id) FROM %s', $table;
  my @counts=reap($stmt);
  foreach my $count (@counts) {
    next unless $count =~ /^\d+$/;
    printf "%s ids found.\n", $count if $loud;
    return $count;
  }
  return -1;
}


sub insert {
  my($url, @rest)=@_;
  return 0 if ($url eq '');

  my $ua=new LWP::UserAgent;
  $ua->agent("Mozilla/8.0");    #Fake out Ceasefire mag.

  my $req=new HTTP::Request 'GET' => "$url";
  $req->header('Accept' => 'text/html');

  # send request
  my $error;
  my $resp=$ua->request($req);
  my $content=$resp->decoded_content;

  # check the outcome
  if ($resp->is_success) {
    #Save to file for cf. to db content.
    #print $resp->decoded_content;
  } else {
    #Make an array of status codes and loop it.
    $error=$resp->status_line . "\n";
    if ($error) {
      chomp $error;
      if ($error =~ /301/) {
      }

      if ($error =~ /302 Found/) {
	printf "Curse ye NYT\nYARR\n";
      }

      if ($error =~ /400 URL must be absolute/) {
      }
      if ($error =~ /403 Forbidden/) {
      }
      if ($error =~ /404.*Not Found/) {
      }
      if ($error =~ /406 Not Acceptable/) {
      }
      if ($error =~ /500 Can't connect/) {
      }
      if ($error =~ /501 Protocol scheme/) {
	printf "'https' is not supported (LWP::Protocol::https not installed)\n";
      }
      printf "Error: %s\n", $error;
      return $error;
    }
    
    if ($error =~ /503: Service Unavailable:/) {
      # http://www.zdnet.com/news/spy-agency-taps-into-undersea-cable/115877
    }
  }

  my $size=length($content);

  if ($size == 0) {
    $skipped++;
    return 0;
  } else {
    #printf STDERR "Got %s bytes at %s\n", $size, $url;
    if ($size > $MAX_CONTENT) {
      init($size);
    }
  }

  my $title=$content;

  #This takes forever with a pdf.
  #Find a way to adapt title search by MIME type?

  if ($title =~ s/^.*<title>//imsx) {
    #printf STDERR "Found title..";
  } else {
    printf STDERR "Found no title tag\n";
    $title=$url;
  }

  if ($title =~ s#</title>.*##imsx) {
    #printf STDERR ". and title end... ";
  } else {
    printf STDERR "...anywhere.  ";
  }

  $title =~ s/^\s+//;
  $title =~ s/\s+$//;

  my $l=length($title);

  if ($l == 0) {
    warn "No title!";
  }

  if ($l > $MAX_TITLE) {
    printf "%s is too big for a title.  Next!\n%s\n", $l, $title;
    return -1;
  }

  if ($l > $MAX_TITLE_DISPLAY) {
    $title=substr($title, 0, $MAX_TITLE_DISPLAY-$l);
  }

  $url=$dbh->quote($url);
  $content=$dbh->quote($content);
  $title=$dbh->quote($title);

  #Placeholders are your friends.
  my $st=
    sprintf "INSERT INTO %s (url,title,content,size) VALUE (%s,%s,%s,%s)",
      $table, $url, $title, $content, $size; #Size of unquoted content.

  $inserted += sow($st);

  $st=sprintf "SELECT `id`,`title`,`url` FROM %s WHERE `url`=%s", $table, $url;
  my $sth=$dbh->prepare($st) or die $dbh->errstr;
  $sth->execute();

  my $id;
  ($id, $title, $url, @rest)=reap($st);

  my $len=length($title);
  printf "Id: %s  ",     $id   if ($id);
  printf "Content: %s ", $size if ($size);
  printf "Title: %s\n",  $len  if ($len);
  printf "%s\n", $url;

  return 1;
}


sub drop_table {
  my($t, @rest)=@_;
  #my $cmd='mysqldump -p -uroot articles >articles.sql';
  #my $cmd='backup.sh';
  #printf "%s\n", $cmd;
  #my($result)=`$cmd`;

  my $smt=sprintf "DROP TABLE IF EXISTS %s", $t;
  printf "Dropping table %s\n", $t;
  sow($smt);
  return 0;
}


sub list_articles {
  my $st="SELECT `id`,`size`,`title` FROM " . $table;
  my $loud=1;
  my $dref=harvest($st, $loud);

  for my $row (@$dref) {
    for (keys %$row) {
      $row->{$_} =~ s/^.*<title>//im;
      $row->{$_} =~ s^<title/>.*$^^im;
      print "$_ => $row->{$_}\n" unless $quiet;
    }
  }
  exit 0;
}


sub saveUrl {
  my($uri,$inDb)=@_;
  chomp $uri;
  my $ustash="$ENV{DOCUMENTS}";
  die "DOCUMENTS not defined\n" unless -d $ustash;

  if ($inDb) {
    $ustash .= '/urls.txt';
  } else {
    $ustash .= '/urls.unused.txt';
  }

  my $I; #Earache

  my @uris=();
  if (open($I, "<", $ustash)) {
    while(<$I>){
      chomp;
      push @uris, $_;
    }
  }

  my $skipit=0;
  foreach my $u (@uris) {
    if ($u eq $uri) {
      $skipit=1;
      return ''; # last;
    }
  }
  close $I;

  push(@uris,$uri);
  open(my $O, ">", $ustash) or die $!;
  foreach my $u (@uris) {
    printf $O "%s\n", $u;
  }
  close $O;
  return $ustash;
}

sub getPDF {
  my($uri, @rest)=@_;
  my $pdffile=basename($uri);
  $pdffile=qq/$pdffile/;
  $pdffile =~ s/%20/ /g;

  if (-f "$pdffile" ) {
    #system("/usr/bin/evince $pdffile &");
    return 0;
  }

  my $ua=new LWP::UserAgent;
  $ua->agent("Mozilla/8.0");    #Fake out Ceasefire mag.

  my $req=new HTTP::Request 'GET' => "$uri";
  $req->header('Accept' => 'application/pdf');


  # send request
  my $error;
  my $resp=$ua->request($req);
  my $content=$resp->decoded_content;

  # check the outcome
  if ($resp->is_success) {

    open my $P, ">",  $pdffile or die $!; #s/b binary mode?
    binmode $P, ':utf8';
    printf $P "%s\n", $resp->decoded_content;
    close $P;

    die $! if (!-f $pdffile);
    #Save to file for cf. to db content.
    #print $resp->decoded_content;
    return 1;
  }
  return 0;
}

__DATA__

__END__
Implementing basename in SQL:
CREATE TEMPORARY TABLE t (`id` mediumint unique, `size` mediumint);
SELECT RIGHT(url, LOCATE('/',REVERSE(url))-1) FROM articles;

GRANT SELECT, INSERT, UPDATE, DELETE, CREATE, DROP ON `nine`.`nine` TO 'e3'@'%';
Query OK, 0 rows affected (0.34 sec)

mysql> show grants for e3;
+---------------------------------------------------------------------------------------------------------------------------------+
| Grants for e3@%                                                                                                                 |
+---------------------------------------------------------------------------------------------------------------------------------+
| GRANT SELECT, INSERT, UPDATE, DELETE, CREATE, DROP, RELOAD, FILE, LOCK TABLES, REPLICATION CLIENT, SHOW VIEW ON *.* TO 'e3'@'%' |
| GRANT ALL PRIVILEGES ON `test`.* TO 'e3'@'%'                                                                                    |
| GRANT ALL PRIVILEGES ON `articles`.* TO 'e3'@'%'                                                                                |
| GRANT ALL PRIVILEGES ON `sd`.`sd` TO 'e3'@'%'                                                                                   |
| GRANT ALL PRIVILEGES ON `articles`.`articles` TO 'e3'@'%'                                                                       |
| GRANT SELECT, INSERT, UPDATE, DELETE, CREATE, DROP ON `mysql`.`articles` TO 'e3'@'%'                                            |
| GRANT SELECT, INSERT, UPDATE, DELETE, CREATE, DROP ON `nine`.`nine` TO 'e3'@'%'                                                 |
| GRANT ALL PRIVILEGES ON `jobs`.`jobs` TO 'e3'@'%'                                                                               |
| GRANT ALL PRIVILEGES ON `music`.`mp3s` TO 'e3'@'%'                                                                              |
+---------------------------------------------------------------------------------------------------------------------------------+
9 rows in set (0.00 sec)

mysql> 
#############################################################################
