Browse Source

strategy de conversion, html vers md

nas 3 years ago
parent
commit
db5bb9f4b6
1 changed files with 77 additions and 34 deletions
  1. 77
    34
      syncWeb.pl

+ 77
- 34
syncWeb.pl View File

@@ -1,19 +1,29 @@
1 1
 #!/usr/bin/perl
2 2
 use strict;
3 3
 use Digest::SHA qw(sha256_hex);
4
-use LWP::UserAgent;
4
+use HTTP::Tiny;
5 5
 use DBI qw(:sql_types);
6 6
 use Getopt::Long;
7
-use pQuery;
7
+use Mojo::DOM;
8
+use HTML::WikiConverter;
9
+use Switch;
10
+#use Data::Dumper;
8 11
 
9
-sub init {
10
-    my $driver   = "SQLite";
11
-    my $database = "history.db";
12
+sub trim {
13
+    (my $s = $_[0]) =~ s/^\s+|\s+$//g;
14
+    return $s;
15
+}
16
+
17
+sub connect_db($$$){
18
+    my ($driver, $database, $password) = @_;
12 19
     my $dsn = "DBI:$driver:dbname=$database";
13 20
     my $dbh = DBI->connect($dsn, { RaiseError => 1 })
14 21
         or die $DBI::errstr;
15 22
     print "Opened database successfully\n";
16
-    return $dbh;
23
+}
24
+
25
+sub init {
26
+    return connect_db("SQLite", "history.db", "");
17 27
 }
18 28
 
19 29
 sub add_History($$$){
@@ -23,14 +33,20 @@ sub add_History($$$){
23 33
     my $rv = $dbh->do($stmt) or die $DBI::errstr;
24 34
 }
25 35
 
26
-sub get_html($) {
27
-    my $ua = new LWP::UserAgent;
28
-    $ua->timeout(120);
29
-    my $url = @_ ;
30
-    my $request = new HTTP::Request('GET', $url);
31
-    my $response = $ua->request($request);
32
-    my $content = $response->content();
33
-    return $content;
36
+sub get_html($){
37
+    my ($url) = @_;
38
+    my $response = HTTP::Tiny->new->get($url);
39
+    if ($response->{success}) {
40
+        return $response->{content};
41
+    }
42
+    return 0;
43
+}
44
+
45
+sub get_content($$){
46
+    my($url, $selector) = @_;
47
+    my $content = get_html( $url );
48
+    my $dom = Mojo::DOM->new( $content );
49
+    return $dom->at( "$selector" );
34 50
 }
35 51
 
36 52
 sub visit($$){
@@ -41,15 +57,19 @@ sub visit($$){
41 57
 
42 58
 sub isNew($$$){
43 59
     my($url, $signature, $dbh) = @_;
44
-    my $sth = $dbh->prepare("SELECT COUNT(signature) FROM visit WHERE signature=?1 AND url=?2");
45
-    $sth->execute($signature, $url);
46
-    my $refs = $sth->fetchall_arrayref()->[0][0];
47
-    return ($refs)? 0 : 1 ;
60
+    # my $sth = $dbh->prepare("SELECT count(signature) AS COUNT FROM visit WHERE signature=?1 AND url=?2;");
61
+    my $sth = $dbh->prepare("SELECT count(signature) AS COUNT FROM visit WHERE signature='$signature' AND url='$url';");
62
+    #$sth->execute("$signature", "$url");
63
+    $sth->execute();
64
+    my $refs = $sth->fetchrow_arrayref()->[0];
65
+    print "SELECT count(signature) AS COUNT FROM visit WHERE signature=$signature AND url=$url\n";
66
+    print "$url, count : $refs\n";
67
+    return !($refs);
48 68
 }
49 69
 
50
-sub registerIfNew($$){
51
-    my($url, $dbh) = @_;
52
-    my $signature = visit($url, $dbh);
70
+sub registerIfNew($$$){
71
+    my($url, $selector,$dbh) = @_;
72
+    my $signature = sha256_hex( get_content($url, $selector) );
53 73
 
54 74
     if( isNew($url, $signature ,$dbh)){
55 75
 	add_History($url, $signature ,$dbh);
@@ -58,32 +78,55 @@ sub registerIfNew($$){
58 78
     return 0;
59 79
 }
60 80
 
61
-sub importFromUrl($){
62
-    my ($url) = @_;
63
-    print "importing $url";
81
+sub convertStrategy($$$){
82
+    my ($content, $fromFormat, $toFormat) = @_;
83
+    
84
+    switch("$fromFormat $toFormat") {
85
+	case "html md"  { return htmlToMd( $content ) }
86
+	case "a"        { print "string a" }
87
+	else            { return $content }
88
+    }
89
+}
90
+
91
+sub importContent($$$$){
92
+    my ($url, $selector, $fromFormat, $toFormat) = @_;
93
+    my $content = get_content( $url, $selector );
94
+
95
+    if( $content ){
96
+	return convertStrategy( $content, $fromFormat, $toFormat );
97
+    }
98
+    return 0;
64 99
 }
65 100
 
66
-sub checkUrl($){
67
-    my($url) = @_;
101
+sub checkUrl($$){
102
+    my($url, $selector) = @_;
68 103
     my $dbh = init;
69
-    if( registerIfNew($url, $dbh) ){
70
-        importFromUrl($url);
104
+    if( registerIfNew($url, $selector,$dbh) ){
105
+        get_content( $url, $selector )
71 106
     }
72 107
     $dbh->disconnect();
73 108
 }
74 109
 
75
-sub get_content($$){
76
-    ;
110
+sub htmlToMd($){
111
+    my($html) = @_;
112
+    my $wc = new HTML::WikiConverter( dialect => 'Markdown' );
113
+    return $wc->html2wiki( html => $html );
77 114
 }
78 115
 
79 116
 sub main($){
117
+    my $dbh = init;
80 118
     my($file) = @_;
81
-    print "Using file $file\n";
82
-    #checkUrl( $url );
83 119
     open(FH, '<', $file) or die $!;
84 120
     while(<FH>){
85
-	print "Checking $_";
86
-	checkUrl( $_ );
121
+	my($url, $selector, $fromFormat, $toFormat) = split("\t");
122
+	$fromFormat = trim($fromFormat);
123
+	$toFormat = trim($toFormat);
124
+	#print get_content( $url, $selector );
125
+	#checkUrl( $_ );
126
+	#print registerIfNew( $url, $selector, $dbh );
127
+	print " $url, $selector, $fromFormat, $toFormat ";
128
+	print importContent( $url, $selector, $fromFormat, $toFormat );
129
+	print "\n"
87 130
     }
88 131
     close(FH);
89 132
 }

Loading…
Cancel
Save