Batterie de scripts servant à synchroniser du contenu web
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

syncWeb.pl 5.4KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221
  1. #!/usr/bin/perl
  2. use strict;
  3. use Digest::SHA qw(sha256_hex);
  4. use HTTP::Tiny;
  5. use DBI qw(:sql_types);
  6. use Getopt::Long;
  7. use Mojo::DOM;
  8. use HTML::WikiConverter;
  9. use Switch;
  10. use String::Escape;
  11. use Data::Dumper::Perltidy;
  # File-level state: the two command-line values filled in by GetOptions,
  # and the export configuration that main() loads from the --export file.
  my ($import_list, $export, $config);
  # Return a copy of the first argument with leading and trailing
  # whitespace removed. The caller's value is left untouched.
  sub trim {
      my ($text) = @_;
      $text =~ s/^\s+//;    # strip the leading run
      $text =~ s/\s+$//;    # strip the trailing run
      return $text;
  }
  # Open a DBI connection and return the database handle.
  #
  # Parameters:
  #   $driver   - DBI driver name (this file uses "SQLite" and "mysql")
  #   $database - database name, placed in the DSN as dbname=...
  #   $user     - user name; a false value means "connect without a user"
  #   $password - password; only used when a user name is also given
  #   $host     - server host; appended to the DSN only when non-empty
  #
  # Returns the connected handle, created with RaiseError enabled.
  # Dies with the DBI error text when the connection cannot be opened.
  sub connect_db($$$$$){
      my ($driver, $database, $user, $password, $host) = @_;

      my $dsn = "DBI:$driver:dbname=$database";
      $dsn .= ";host=$host" if defined $host && length $host;

      # DBI->connect expects (dsn, user, password, \%attr). Missing
      # credentials are passed as undef so the attribute hash always sits
      # in the fourth slot and RaiseError is really applied.
      my $login  = $user ? $user : undef;
      my $secret = ( $user && $password ) ? $password : undef;

      my $dbh = DBI->connect( $dsn, $login, $secret, { RaiseError => 1 } )
          or die $DBI::errstr;

      return $dbh;
  }
  # Handle the export of $content to the MariaDB target named $export.
  #
  # Connection settings and the destination table/column are read from the
  # file-level $config under {'export'}{$export}. The INSERT statement is
  # only printed to STDOUT; it is not executed against the database.
  #
  # Parameters:
  #   $export  - key of the export target inside $config->{'export'}
  #   $content - text placed between quotes in the VALUES clause
  #              (presumably already escaped by the caller - TODO confirm)
  #
  # Returns the result of the print call.
  sub to_mariadb($$){
      my ($export, $content) = @_;
      my $connexion = $config->{'export'}->{$export}->{'connexion'};

      # Keep the handle so the connection can be closed explicitly below.
      my $dbh = connect_db(
          "mysql",
          "$connexion->{'base'}",
          "$connexion->{'user'}",
          "$connexion->{'password'}",
          "$connexion->{'host'}"
      );

      # Default columns are not implemented yet: both fragments stay empty.
      my $columns_default = "";
      my $columns_default_value = "";

      my $destination = $config->{'export'}->{$export}->{'content'};
      my $printed = print "INSERT INTO $destination->{'table'} (\n"
          . "$destination->{'column'}\n"
          . "$columns_default\n"
          . ")\n"
          . "VALUES ('$content' $columns_default_value);";

      $dbh->disconnect();
      return $printed;
  }
  # Write $content to the file at $path, replacing any previous content,
  # then print "done" to STDOUT.
  #
  # Parameters:
  #   $path    - destination file name
  #   $content - data to write
  #
  # Dies when the file cannot be opened, written or closed.
  sub to_file($$){
      my ($path, $content) = @_;
      my $filename = "$path";

      open(my $fh, '>', $filename) or die "Could not open file '$filename' $!";
      print {$fh} "$content" or die "Could not write to file '$filename' $!";
      # Buffered write errors only surface at close, so check it as well.
      close($fh) or die "Could not close file '$filename' $!";

      print "done\n";
  }
  # Connect to the SQLite database "history.db" without credentials or
  # host, and return the handle produced by connect_db.
  sub init (){
      my ($driver, $database) = ("SQLite", "history.db");
      my $dbh = connect_db($driver, $database, "", "", "");
      return $dbh;
  }
  # Record a visit in the "visit" table.
  #
  # Parameters:
  #   $url       - address that was fetched
  #   $signature - signature stored next to the URL
  #   $dbh       - database handle used for the INSERT
  #
  # Returns the value of $dbh->do. Dies with the DBI error text on failure.
  sub add_History($$$){
      my ($url, $signature, $dbh) = @_;

      # Bind values keep quotes inside the URL from changing the statement.
      my $rv = $dbh->do(
          'INSERT INTO visit (url,signature) VALUES (?, ?)',
          undef, $url, $signature
      ) or die $DBI::errstr;

      return $rv;
  }
  # Download $url with HTTP::Tiny.
  #
  # Returns the response content when the request succeeded, 0 otherwise.
  sub get_html($){
      my ($address) = @_;
      my $reply = HTTP::Tiny->new->get($address);
      return $reply->{success} ? $reply->{content} : 0;
  }
  # Fetch $url and pick the element matching the CSS selector $selector.
  #
  # The downloaded text (or the 0 that get_html yields on failure) is parsed
  # with Mojo::DOM; the result of its "at" method is returned as is.
  sub get_content($$){
      my ($url, $selector) = @_;
      my $html = get_html( $url );
      return Mojo::DOM->new( $html )->at( "$selector" );
  }
  # Return the SHA-256 hex digest of what get_html yields for $url.
  # A second argument (a database handle) is accepted but not used.
  sub visit($$){
      my ($url) = @_;
      my $page = get_html( "$url" );
      my $digest = sha256_hex( $page );
      return $digest;
  }
  # Tell whether the pair ($url, $signature) is absent from the "visit" table.
  #
  # Parameters:
  #   $url       - address to look up
  #   $signature - signature to look up
  #   $dbh       - database handle used for the query
  #
  # Returns a true value when no matching row exists, a false value otherwise.
  sub isNew($$$){
      my ($url, $signature, $dbh) = @_;

      # Bind values instead of interpolation: the URL may contain quotes.
      my $sth = $dbh->prepare(
          "SELECT count(signature) AS COUNT FROM visit WHERE signature=? AND url=?;"
      );
      $sth->execute( $signature, $url );
      my $refs = $sth->fetchrow_arrayref()->[0];
      $sth->finish();

      return !($refs);
  }
  # Record the current state of a page fragment if it was not seen before.
  #
  # The signature is the SHA-256 hex digest of what get_content returns for
  # ($url, $selector). When isNew reports the pair (url, signature) as
  # unknown, it is stored through add_History.
  #
  # Returns 1 when a new entry was stored, 0 otherwise.
  sub registerIfNew($$$){
      my ($url, $selector, $dbh) = @_;
      my $fingerprint = sha256_hex( get_content($url, $selector) );

      return 0 unless isNew($url, $fingerprint, $dbh);

      add_History($url, $fingerprint, $dbh);
      return 1;
  }
  # Convert $content from one format to another.
  #
  # Parameters:
  #   $content    - data to convert
  #   $fromFormat - name of the current format
  #   $toFormat   - name of the wanted format
  #
  # Only the pair "html" -> "md" triggers a conversion (through htmlToMd);
  # for every other pair $content is returned unchanged.
  sub convert_strategy($$$){
      my ($content, $fromFormat, $toFormat) = @_;

      if( "$fromFormat $toFormat" eq "html md" ){
          return htmlToMd( $content );
      }
      return $content;
  }
  # Fetch a page fragment and convert it to the wanted format.
  #
  # Parameters:
  #   $url        - address to download
  #   $selector   - CSS selector of the fragment to keep
  #   $fromFormat - format of the fetched fragment
  #   $toFormat   - format to convert to
  #
  # Returns the result of convert_strategy, or 0 when get_content yields
  # a false value.
  sub importContent($$$$){
      my ($url, $selector, $fromFormat, $toFormat) = @_;

      my $content = get_content( $url, $selector );
      if( $content ){
          return convert_strategy( $content, $fromFormat, $toFormat );
      }
      return 0;
  }
  # Open the history database, register ($url, $selector) when its content
  # is new, fetch the content again in that case (the fetched value is not
  # used), then close the database.
  #
  # Returns the result of the disconnect call.
  sub checkUrl($$){
      my ($url, $selector) = @_;
      my $history = init();

      get_content( $url, $selector ) if registerIfNew($url, $selector, $history);

      $history->disconnect();
  }
  # Convert an HTML string to Markdown with HTML::WikiConverter.
  #
  # Parameters:
  #   $html - HTML source to convert
  #
  # Returns the converter's html2wiki output for the 'Markdown' dialect.
  sub htmlToMd($){
      my ($html) = @_;

      # Arrow call rather than "new Class(...)", which Perl can misparse.
      my $wc = HTML::WikiConverter->new( dialect => 'Markdown' );
      return $wc->html2wiki( html => $html );
  }
  # Send $content to the export target named $export.
  #
  # The target is looked up in the file-level $config under
  # {'export'}{$export}. The content is first converted from $format to the
  # target's configured 'format', then passed through
  # String::Escape::backslash. A target of type "mariadb" goes to
  # to_mariadb; any other type is written to the target's 'path' with to_file.
  #
  # Parameters:
  #   $export  - key of the export target inside $config->{'export'}
  #   $format  - format $content is currently in
  #   $content - data to export
  sub export_strategy($$$){
      my ($export, $format, $content) = @_;
      my $type = $config->{'export'}->{$export}->{'type'};

      print "export straty de $export $type";

      my $converted = convert_strategy(
          $content,
          $format,
          "$config->{'export'}->{$export}->{'format'}"
      );
      my $escaped = String::Escape::backslash( $converted );

      if( $type ne "mariadb" ){
          to_file( "$config->{'export'}->{$export}->{'path'}", $escaped );
      }else{
          print "\nis $type mariadb?\n";
          to_mariadb( $export, $escaped );
      }
  }
  # Synchronise every entry of an import list.
  #
  # Parameters:
  #   $import_list - path of a text file with one entry per line, made of
  #                  five tab-separated fields: URL, CSS selector, source
  #                  format, target format, export target name
  #   $export      - path (relative to the current directory) of a Perl file
  #                  whose value is the configuration hash; it is stored in
  #                  the file-level $config
  #
  # For each entry the selected content is exported only when registerIfNew
  # reports it as not yet recorded in the history database.
  #
  # Dies when the configuration or the import list cannot be read, or when
  # a non-blank line does not hold five fields.
  sub main($$){
      my ($import_list, $export) = @_;

      $config = do("./$export");
      ref($config) eq 'HASH'
          or die "Could not load export configuration './$export': "
               . ( $@ || $! || "file did not return a hash reference" ) . "\n";

      my $dbh = init;

      open(my $list, '<', $import_list)
          or die "Could not open import list '$import_list': $!";

      while( my $line = <$list> ){
          next unless $line =~ /\S/;    # ignore blank lines

          # NOTE(review): $fromFormat is read but not used below, and
          # $toFormat is handed to export_strategy as the format of the
          # fetched content - confirm this is intended.
          my ($url, $selector, $fromFormat, $toFormat, $target) = split(/\t/, $line);
          defined $target
              or die "Malformed line $. in '$import_list': expected 5 tab-separated fields\n";

          $toFormat = trim( $toFormat );
          $target   = trim( $target );

          # Export once, and only for content that was not recorded yet.
          if( registerIfNew( $url, $selector, $dbh ) ){
              my $content = get_content( $url, $selector );
              export_strategy( $target, $toFormat, $content );
          }
      }

      close($list);
      $dbh->disconnect();
  }
  # Entry point: read --import-list and --export from the command line and
  # run main with them. Both options are required; a parsing error or a
  # missing option stops the script with a usage message.
  GetOptions (
      'import-list=s' => \$import_list,
      'export=s' => \$export
  ) or die "Usage: $0 --import-list=FILE --export=CONFIG\n";

  ( defined $import_list && defined $export )
      or die "Usage: $0 --import-list=FILE --export=CONFIG\n";

  main( $import_list, $export );