comparison bin/chronicle-rss-importer @ 222:fdbbc1bb5d7c

Add the stub migration script + readme
author Steve Kemp <steve@steve.org.uk>
date Wed, 16 Apr 2008 19:31:23 +0100
parents
children 389f234d4111
comparison
equal deleted inserted replaced
221:eded23ae8288 222:fdbbc1bb5d7c
1 #!/usr/bin/perl -w
2
3 use strict;
4 use warnings;
5
6 use Getopt::Long;
7 use HTML::Entities;
8 use LWP;
9 use XML::RSSLite;
10
11
#
#  Settings gathered from the command line; shared by the
# subroutines defined later in this script.
#
my %CONFIG;


#
#  Read the command line options, then sanity-check them.
#
parseCommandLineArguments();
validateCommandLineArguments();


#
#  Download the raw text of the feed we were asked to import.
#
my $content = fetchRSSFeed( $CONFIG{ 'feed' } );


#
#  Parse that text into a hash of feed data.
#
my %rssHash;
parseRSS( \%rssHash, \$content );


#
#  Write each entry out beneath the output directory.
#
processEntries(%rssHash);


#
#  All done.
#
52
53
54
=begin doc

  Parse the command line arguments, if any.

  The recognised options are --feed=URI, --output=DIR and --sequential;
  their values are stored in the global %CONFIG hash.

  If the command line contains an unknown or malformed option we exit
  with a non-zero status instead of carrying on with a partially-parsed
  set of options.

  Returns the (true) result of GetOptions on success.

=end doc

=cut

sub parseCommandLineArguments
{
    my $parsed = GetOptions(

        # Where to read from, where to write to, and how to name files.
        "feed=s",     \$CONFIG{ 'feed' },
        "output=s",   \$CONFIG{ 'output' },
        "sequential", \$CONFIG{ 'sequential' },
    );

    #
    #  GetOptions has already reported what was wrong via warn(),
    # so all that remains is to stop with a failure status.
    #
    exit 1 unless ($parsed);

    return $parsed;
}
73
74
75
=begin doc

  Ensure we received the arguments we need, and that
 those arguments look OK.

  We require --output to name an existing directory, and --feed to
 be set.  If either check fails a message is written to STDERR and
 the script exits with status 1, so that callers can detect the
 failure.

=end doc

=cut

sub validateCommandLineArguments
{

    #
    #  We need an output dir
    #
    if ( !$CONFIG{ 'output' } )
    {
        print STDERR "Output directory is mandatory.\n";
        print STDERR "Please specify via --output=...\n";
        exit 1;
    }
    if ( !-d $CONFIG{ 'output' } )
    {
        print STDERR
          "Specified output directory [$CONFIG{'output'}] is not a directory!\n";
        exit 1;
    }

    #
    #  We need a feed
    #
    if ( !$CONFIG{ 'feed' } )
    {
        print STDERR
          "Please specify a feed to import, via --feed=http:/....\n";
        exit 1;
    }

    return;
}
114
115
116
=begin doc

  Fetch the remote RSS feed.

  The single argument is the URI of the feed.  On success the body of
 the HTTP response is returned.  On failure the reason is written to
 STDERR and the script exits with status 1.

=end doc

=cut

sub fetchRSSFeed
{
    my ($uri) = (@_);

    my $ua = LWP::UserAgent->new();
    $ua->timeout(10);
    $ua->agent('chronicle-importer');

    my $response = $ua->get($uri);

    #
    #  The happy path: hand back the response body as-is.
    #
    return ( $response->content() ) if ( $response->is_success );

    #
    #  Otherwise report the failure and stop; there is nothing
    # to import without the feed.
    #
    print STDERR "Failed to fetch feed: $uri\n";
    print STDERR "\n";
    print STDERR $response->message() . "\n";
    exit 1;
}
148
149
150
=begin doc

  Iterate over the items in our feed and write each one out to a
 single file.

  The argument is the parsed feed, passed as a flattened hash; the
 entries are read from the array reference stored beneath its 'item'
 key.  A feed with no items results in no files being written.

  Each file is created beneath the directory named by
 $CONFIG{'output'}.  It is named either after the position of the
 entry (1.txt, 2.txt, ...) when $CONFIG{'sequential'} is set, or
 after the entry title with every character outside a-z, A-Z and
 0-9 replaced by an underscore.

  NOTE: In the title-based mode two entries which share a title map
 to the same filename, so the later one overwrites the earlier.

=end doc

=cut

sub processEntries
{
    my (%entries) = (@_);

    my $count = 1;

    #
    #  Use the feed we were handed, and cope with it having no items.
    #
    foreach my $item ( @{ $entries{ 'item' } || [] } )
    {

        #
        #  Get details from the feed.
        #
        #  Missing values become empty strings so that the match and
        # the interpolation below don't operate upon undef.
        #
        my $title = $item->{ 'title' } || "no title";
        my $date  = $item->{ 'pubDate' } || $item->{ 'dc:date' } || '';
        my $body =
             $item->{ 'description' }
          || $item->{ 'content:encoded' }
          || '';
        my $filename;


        #
        #  Build up a suitable filename.
        #
        if ( $CONFIG{ 'sequential' } )
        {
            $filename = $count . ".txt";
        }
        else
        {
            $filename = $title;
            $filename =~ s/[^a-z0-9]/_/gi;
            $filename .= ".txt";
        }

        #
        #  Naive expansion: only decode the entities if the body
        # contains an escaped "<".
        #
        if ( $body =~ m/&lt;/ )
        {
            $body = decode_entities($body);
        }

        $filename = $CONFIG{ 'output' } . "/" . $filename;

        #
        #  Write the entry: a small header, a blank line, then the body.
        #
        open( my $handle, ">", $filename )
          or die "Failed to write to $filename - $!";
        print {$handle} "Title: $title\nDate: $date\n\n$body\n";

        #  Errors from buffered writes only show up at close time.
        close($handle)
          or die "Failed to close $filename - $!";

        $count += 1;
    }

    return;
}