123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225 |
- #!/usr/bin/perl
- use strict;
- use Digest::SHA qw(sha256_hex);
- use HTTP::Tiny;
- use DBI qw(:sql_types);
- use Getopt::Long;
- use Mojo::DOM;
- use HTML::WikiConverter;
- use Switch;
- use String::Escape;
-
- use Data::Dumper::Perltidy;
-
# File-scoped state: the two command-line option values (filled in by
# GetOptions at the bottom of the script) and the export configuration
# structure that main() loads and the export helpers read.
my ( $import_list, $export, $config );
-
# Return a copy of the first argument with leading and trailing whitespace
# removed. The caller's variable is not modified.
sub trim {
    my ($text) = @_;
    $text =~ s/^\s+//;
    $text =~ s/\s+$//;
    return $text;
}
-
# Open a DBI connection and return the database handle.
#
# Parameters:
#   $driver   - DBD driver name used in the DSN (e.g. "SQLite", "mysql")
#   $database - database name (a file path for SQLite)
#   $user     - user name; empty or undef means "none"
#   $password - password; empty or undef means "none"
#   $host     - server host; added to the DSN only when non-empty
#
# Dies when the connection cannot be established. The returned handle has
# RaiseError enabled, so later DBI failures die as well.
sub connect_db($$$$$){
    my ($driver, $database, $user, $password, $host) = @_;

    my $dsn = "DBI:$driver:dbname=$database";
    $dsn .= ";host=$host" if defined $host && length $host;

    # DBI->connect takes (dsn, user, password, \%attr) positionally. The
    # attribute hash therefore always goes in the fourth slot; absent
    # credentials are passed as undef instead of letting the hash slide
    # into the user or password position.
    my $dbh = DBI->connect(
        $dsn,
        ( $user     ? $user     : undef ),
        ( $password ? $password : undef ),
        { RaiseError => 1 },
    ) or die $DBI::errstr;

    return $dbh;
}
-
# Insert one piece of content into the table configured for an export.
#
# Parameters:
#   $export  - key into $config->{'export'} naming the export target
#   $content - text to store; export_strategy hands it over already run
#              through String::Escape::backslash
#
# Configuration read from $config->{'export'}->{$export}:
#   connexion      => { base, user, password, host }
#   content        => { table, column }
#   column_default => optional { column_name => value } of extra columns
#
# Table and column names are interpolated because SQL identifiers cannot be
# bound; they come from the local configuration file. Every value is bound
# through a placeholder, so quotes in the content cannot break or alter the
# statement. Dies on connection or insert failure.
sub to_mariadb($$){
    my ($export, $content) = @_;

    my $target    = $config->{'export'}->{$export};
    my $connexion = $target->{'connexion'};

    # The caller backslash-escapes the content so that it could sit inside
    # a quoted SQL literal. With bound parameters no escaping is wanted, so
    # decode it back to the real text before storing it.
    my @columns = ( $target->{'content'}->{'column'} );
    my @values  = ( String::Escape::unbackslash($content) );

    if ( exists $target->{'column_default'} ) {
        my $defaults = $target->{'column_default'};

        # Sorted so the generated statement is stable between runs.
        for my $column ( sort keys %{$defaults} ) {
            push @columns, $column;
            push @values,  $defaults->{$column};
        }
    }

    my $column_list  = join ', ', @columns;
    my $placeholders = join ', ', ('?') x @values;
    my $sql = "INSERT INTO $target->{'content'}->{'table'} ($column_list) "
            . "VALUES ($placeholders)";

    my $dbh = connect_db(
        "mysql",
        "$connexion->{'base'}",
        "$connexion->{'user'}",
        "$connexion->{'password'}",
        "$connexion->{'host'}"
    );

    $dbh->do( $sql, undef, @values );
    $dbh->disconnect();
}
-
# Write $content to the file at $path, replacing any previous contents,
# then print "done" to standard output.
#
# Parameters:
#   $path    - destination file path
#   $content - text to write
#
# Dies if the file cannot be opened, written or closed. Buffered write
# errors (e.g. a full disk) only surface at close, so close is checked too.
sub to_file($$){
    my ($path, $content) = @_;

    open( my $fh, '>', $path ) or die "Could not open file '$path' $!";
    print {$fh} "$content" or die "Could not write to file '$path' $!";
    close($fh) or die "Could not close file '$path' $!";

    print "done\n";
}
-
# Open the SQLite database "history.db" through connect_db and return the
# handle. User, password and host are passed as empty strings.
sub init (){
    my ( $driver, $database ) = ( "SQLite", "history.db" );
    my $none = "";
    return connect_db( $driver, $database, $none, $none, $none );
}
-
# Record a visit: insert ($url, $signature) into the "visit" table.
#
# Parameters:
#   $url       - address of the page
#   $signature - digest string identifying the content that was seen
#   $dbh       - open database handle
#
# Returns the value of $dbh->do; dies with the DBI error text on failure.
# Both values are bound through placeholders, so quotes inside a URL can
# neither break the statement nor be read as SQL.
sub add_History($$$){
    my ($url, $signature, $dbh) = @_;

    my $rv = $dbh->do(
        'INSERT INTO visit (url, signature) VALUES (?, ?)',
        undef,
        $url, $signature,
    ) or die $DBI::errstr;

    return $rv;
}
-
# Fetch $url with HTTP::Tiny. Returns the response content when the request
# succeeded, and 0 otherwise.
sub get_html($){
    my ($url) = @_;

    my $client = HTTP::Tiny->new;
    my $reply  = $client->get($url);

    return $reply->{success} ? $reply->{content} : 0;
}
-
# Download $url with get_html, parse the result with Mojo::DOM and return
# whatever the DOM's at() yields for the CSS selector $selector.
sub get_content($$){
    my ($url, $selector) = @_;

    my $page = get_html($url);
    my $tree = Mojo::DOM->new($page);

    return $tree->at("$selector");
}
-
# Return the SHA-256 hex digest of what get_html returns for $url.
# $dbh is accepted but not used here.
sub visit($$){
    my ($url, $dbh) = @_;

    my $page = get_html("$url");
    return sha256_hex($page);
}
-
# Tell whether the pair ($url, $signature) is absent from the "visit" table.
#
# Parameters:
#   $url       - address of the page
#   $signature - digest string of the content
#   $dbh       - open database handle
#
# Returns a true value when no matching row exists, a false value otherwise.
# The values are bound through placeholders rather than pasted into the SQL.
sub isNew($$$){
    my ($url, $signature, $dbh) = @_;

    my $sth = $dbh->prepare(
        'SELECT count(signature) AS COUNT FROM visit '
      . 'WHERE signature = ? AND url = ?'
    );
    $sth->execute( $signature, $url );

    my $count = $sth->fetchrow_arrayref()->[0];

    # Only the first row is read; release the statement explicitly.
    $sth->finish();

    return !$count;
}
-
# Hash the content selected by $selector at $url (SHA-256 hex) and, when
# isNew reports that this url/signature pair is unknown, store it with
# add_History.
#
# Returns 1 when a new entry was recorded, 0 otherwise.
sub registerIfNew($$$){
    my ($url, $selector, $dbh) = @_;

    my $fragment  = get_content( $url, $selector );
    my $signature = sha256_hex($fragment);

    return 0 unless isNew( $url, $signature, $dbh );

    add_History( $url, $signature, $dbh );
    return 1;
}
-
# Convert $content from one format to another.
#
# Parameters:
#   $content    - the content to convert
#   $fromFormat - name of the current format (e.g. "html")
#   $toFormat   - name of the wanted format (e.g. "md")
#
# Only "html" -> "md" is implemented (through htmlToMd); every other pair
# returns $content unchanged.
#
# Written as a plain comparison rather than with the Switch source filter.
# The former `case "a"` branch could never match, because the key compared
# is always "<from> <to>" and so always contains a space.
sub convert_strategy($$$){
    my ($content, $fromFormat, $toFormat) = @_;

    if ( "$fromFormat $toFormat" eq "html md" ) {
        return htmlToMd($content);
    }

    return $content;
}
-
# Fetch the element matching $selector at $url and convert it from
# $fromFormat to $toFormat.
#
# Returns the converted content, or 0 when get_content found nothing.
sub importContent($$$$){
    my ($url, $selector, $fromFormat, $toFormat) = @_;

    my $content = get_content( $url, $selector );
    return 0 unless $content;

    # The converter in this file is convert_strategy; the previous call to
    # "convertStrategy" named a sub that does not exist and died at runtime.
    return convert_strategy( $content, $fromFormat, $toFormat );
}
-
# Open the history database, register the current content of
# $url/$selector if it is new, fetch the content once more in that case
# (the result is discarded), and close the database handle.
sub checkUrl($$){
    my ($url, $selector) = @_;

    my $dbh    = init();
    my $is_new = registerIfNew( $url, $selector, $dbh );

    get_content( $url, $selector ) if $is_new;

    $dbh->disconnect();
}
-
# Convert HTML to Markdown with HTML::WikiConverter's "Markdown" dialect
# and return the converted text.
sub htmlToMd($){
    my ($html) = @_;

    # Direct method call instead of the indirect-object form
    # "new HTML::WikiConverter(...)", which Perl can misparse.
    my $converter = HTML::WikiConverter->new( dialect => 'Markdown' );

    return $converter->html2wiki( html => $html );
}
-
# Send $content to the export target named $export.
#
# Parameters:
#   $export  - key into $config->{'export'}
#   $format  - format $content is currently in
#   $content - the content to export
#
# The content is converted to the target's configured 'format', passed
# through String::Escape::backslash, and then handed to to_mariadb when the
# target's 'type' is "mariadb", or written to the target's 'path' with
# to_file otherwise. Progress messages go to standard output.
sub export_strategy($$$){
    my ($export, $format, $content) = @_;

    my $type   = $config->{'export'}->{$export}->{'type'};
    my $target = $config->{'export'}->{$export};

    print "export straty de $export $type";

    my $converted = convert_strategy( $content, $format, "$target->{'format'}" );
    $content = String::Escape::backslash($converted);

    if ( $type ne "mariadb" ) {
        return to_file( "$target->{'path'}", $content );
    }

    print "\nis $type mariadb?\n";
    return to_mariadb( $export, $content );
}
-
# Process every entry of the import list and export the pages whose
# selected content has not been seen before.
#
# Parameters:
#   $import_list - path of a tab-separated file; each line holds
#                  url, selector, fromFormat, toFormat, export name
#   $export      - path (relative to the current directory) of a Perl file
#                  whose return value becomes the global $config
#
# Dies when an argument is missing, when the configuration does not load
# as a hash reference, or when the import list cannot be opened. Blank
# lines, lines without an export name, and entries whose selector matches
# nothing are skipped (the latter two with a warning).
sub main($$){
    my ($import_list, $export) = @_;

    die "Usage: $0 --import-list FILE --export CONFIG\n"
        unless defined $import_list && defined $export;

    $config = do("./$export");
    die "Could not load export configuration './$export': "
        . ( $@ || $! || 'no hash reference returned' ) . "\n"
        unless ref($config) eq 'HASH';

    my $dbh = init();

    open( my $list_fh, '<', $import_list )
        or die "Could not open import list '$import_list': $!";

    LINE:
    while ( my $line = <$list_fh> ) {
        next LINE unless $line =~ /\S/;

        # $export_name is the per-line target; it is distinct from the
        # configuration file path held in $export.
        my ($url, $selector, $fromFormat, $toFormat, $export_name) =
            map { trim($_) } split /\t/, $line;

        unless ( defined $export_name && length $export_name ) {
            warn "Skipping malformed line $. of '$import_list'\n";
            next LINE;
        }

        my $content = get_content( $url, $selector );
        unless ($content) {
            warn "No content matched '$selector' at $url\n";
            next LINE;
        }

        # Export only content that was not recorded before; exporting
        # unconditionally as well would duplicate every new page and
        # re-export unchanged ones on each run.
        # NOTE(review): $toFormat is passed as the format the content is
        # currently in, while $fromFormat is unused -- confirm that no
        # import-side conversion is expected here.
        if ( registerIfNew( $url, $selector, $dbh ) ) {
            export_strategy( $export_name, $toFormat, $content );
        }
    }

    close($list_fh);
    $dbh->disconnect();
}
-
# Parse the command line and run. GetOptions returns false when it meets an
# unknown or malformed option; stop with a usage message instead of
# continuing with partial settings.
GetOptions(
    'import-list=s' => \$import_list,
    'export=s'      => \$export,
) or die "Usage: $0 --import-list FILE --export CONFIG\n";

main( $import_list, $export );
|