news4 - RSS aggregation system
Révision | e1b0b09a5f9eb53b88d3cd0618adf0d5cbaaee7d (tree) |
---|---|
l'heure | 2012-10-02 05:06:17 |
Auteur | hylom <hylom@hylo...> |
Committer | hylom |
add tagging filter
@@ -3,8 +3,17 @@ include install.conf | ||
3 | 3 | |
4 | 4 | DEPENDS=install.conf |
5 | 5 | |
6 | -all: install.conf | |
6 | +all: install.conf keywords.py | |
7 | + python gnews.py | |
8 | + | |
9 | +install: install.conf | |
7 | 10 | rsync -av css/ $(INSTALL_DIR)/css |
8 | 11 | rsync -av js/ $(INSTALL_DIR)/js |
9 | 12 | rsync -av img/ $(INSTALL_DIR)/img |
10 | - ./gnews.py | |
13 | + | |
14 | +keywords.py: keywords.txt | |
15 | + python genkeywords.py < $< > $@ | |
16 | + | |
17 | +clean: | |
18 | + rm -f $(INSTALL_DIR)/*.html | |
19 | + rm -rf $(INSTALL_DIR)/tag/* |
@@ -1,4 +1,5 @@ | ||
1 | 1 | # config.py |
2 | +# -*- coding: utf-8 -*- | |
2 | 3 | # configuration for gnews.py |
3 | 4 | |
4 | 5 | config = { |
@@ -15,7 +16,7 @@ config = { | ||
15 | 16 | 'output_directory': 'outputs/tag', |
16 | 17 | }, |
17 | 18 | 'site_parameter': { |
18 | - 'name': 'OpenSource Antenna', | |
19 | + 'name': 'SourceForge.JP Antenna', | |
19 | 20 | 'css_directory': '/css', |
20 | 21 | 'js_directory': '/js', |
21 | 22 | 'tag_directory': '/tag', |
@@ -33,12 +34,49 @@ target_rss = [ | ||
33 | 34 | 'name': 'Slashdot Japan', |
34 | 35 | 'url': 'http://rss.rssad.jp/rss/slashdot/slashdot.rss', |
35 | 36 | 'source_url': 'http://slashdot.jp/', |
36 | - 'filter': 'slashdotjp', | |
37 | + 'filter': ['slashdotjp',], | |
37 | 38 | }, |
38 | 39 | { |
39 | 40 | 'name': 'ITmedia', |
40 | 41 | 'url': 'http://rss.rssad.jp/rss/itmtop/2.0/itmedia_all.xml', |
41 | 42 | 'source_url': 'http://www.itmedia.co.jp/', |
43 | + 'filter': ['tagging',], | |
44 | + }, | |
45 | + { | |
46 | + 'name': 'So-netセキュリティ通信', | |
47 | + 'url': 'http://security-t.blog.so-net.ne.jp/index.xml', | |
48 | + 'source_url': 'http://security-t.blog.so-net.ne.jp/', | |
49 | + 'filter': ['tagging',], | |
50 | + }, | |
51 | + { | |
52 | + 'name': 'Engadget Japanese', | |
53 | + 'url': 'http://japanese.engadget.com/rss.xml', | |
54 | + 'source_url': 'http://japanese.engadget.com/', | |
55 | + 'filter': [ | |
56 | + 'tagging', | |
57 | + 'extractimg', | |
58 | + ], | |
59 | + }, | |
60 | + { | |
61 | + 'name': 'ギズモード・ジャパン', | |
62 | + 'url': 'http://feeds.gizmodo.jp/rss/gizmodo/index.xml', | |
63 | + 'source_url': 'http://www.gizmodo.jp/', | |
64 | + 'filter': ['tagging',], | |
65 | + }, | |
66 | + { | |
67 | + 'name': 'TechCrunch Japan', | |
68 | + 'url': 'http://jp.techcrunch.com/feed/', | |
69 | + 'source_url': 'http://jp.techcrunch.com/', | |
70 | + 'filter': ['tagging',], | |
42 | 71 | }, |
43 | 72 | ] |
44 | 73 | |
74 | + | |
75 | +"""Template: | |
76 | + { | |
77 | + 'name': '', | |
78 | + 'url': '', | |
79 | + 'source_url': '', | |
80 | + 'filter': ['tagging',], | |
81 | + }, | |
82 | +""" |
@@ -27,4 +27,12 @@ header { | ||
27 | 27 | font-weight: normal; |
28 | 28 | } |
29 | 29 | |
30 | - | |
\ No newline at end of file | ||
30 | +.entry-footer { | |
31 | + border-bottom: 1px solid #D6D6D6; | |
32 | + margin-bottom: 1em; | |
33 | +} | |
34 | + | |
35 | +.entry-header h3 { | |
36 | + line-height: 130%; | |
37 | + margin: 10px auto 1em; | |
38 | +} | |
\ No newline at end of file |
@@ -42,15 +42,16 @@ class FeedFetcher(object): | ||
42 | 42 | entries = [x for x in entries if x] |
43 | 43 | |
44 | 44 | if 'filter' in self._feed: |
45 | - entry_filter = self._get_filter() | |
46 | - entries = [entry_filter(x) for x in entries] | |
45 | + filters = self._feed.get('filter', None) | |
46 | + for filter in filters: | |
47 | + entry_filter = self._get_filter(filter) | |
48 | + entries = [entry_filter(x) for x in entries] | |
47 | 49 | # remove entry which is None |
48 | 50 | entries = [x for x in entries if x] |
49 | 51 | return entries |
50 | 52 | |
51 | - def _get_filter(self): | |
53 | + def _get_filter(self, filter_name): | |
52 | 54 | 'load filter by seed settings' |
53 | - filter_name = self._feed.get('filter', None) | |
54 | 55 | |
55 | 56 | # fallback when filter isn't defined |
56 | 57 | if filter_name is None: |
@@ -0,0 +1,27 @@ | ||
1 | +# filter for Image extraction | |
2 | +# -*- coding: utf-8 -*- | |
3 | + | |
4 | +import re | |
5 | + | |
# Matches an <img> tag and captures the value of its src attribute.
# The tail uses [^>]* instead of '.' so a tag that continues on the next line
# still matches, and IGNORECASE accepts upper-case markup (<IMG SRC=...>).
# The u'' prefix (no backslashes in the pattern, so no raw string is needed)
# is valid on both Python 2 and Python 3, unlike ur''.
re_imgtag = re.compile(u'''<img[^>]*src=["']([^'"]*?)["'][^>]*>''',
                       re.IGNORECASE)


def entry_filter(entry):
    """Move inline images out of an entry's body.

    Every ``<img>`` tag found in ``entry['body']`` is removed from the body
    and the URL from its ``src`` attribute is collected under
    ``entry['images']`` (appended to an existing list, or stored as a new
    one).  An entry without any ``<img>`` tag is left untouched.

    Args:
        entry: mapping with at least a ``'body'`` text item.

    Returns:
        The same ``entry`` object, modified in place.

    Raises:
        KeyError: if ``entry`` has no ``'body'`` item.
    """
    body = entry['body']
    images = re_imgtag.findall(body)
    if not images:
        return entry

    if 'images' in entry:
        entry['images'].extend(images)
    else:
        entry['images'] = images
    # Strip the tags only after their URLs have been collected.
    entry['body'] = re_imgtag.sub('', body)
    return entry
27 | + |
@@ -0,0 +1,26 @@ | ||
1 | +# filter for keyword-based tagging | |
2 | +# -*- coding: utf-8 -*- | |
3 | + | |
4 | +#import re | |
5 | +from keywords import keywords | |
6 | + | |
def entry_filter(entry):
    """Tag an entry with every keyword group its title or body mentions.

    ``keywords`` is a sequence of tuples; the first item of a tuple is the
    tag name and all items (including the first) are the strings to look
    for.  A group contributes its tag once, as soon as any of its strings
    occurs in ``entry['body']`` or ``entry['title']``.

    The tags are merged into ``entry['tags']``: added to an existing list
    without duplicating tags already present, or stored as a new list
    (possibly empty) when the entry has none yet.

    Args:
        entry: mapping with ``'body'`` and ``'title'`` text items.

    Returns:
        The same ``entry`` object, modified in place.

    Raises:
        KeyError: if ``entry`` lacks ``'body'`` or ``'title'``.
    """
    body = entry['body']
    title = entry['title']
    tags = []
    for keyword in keywords:
        # An empty string is a substring of every text, so an empty key
        # (e.g. produced from a blank line in the keyword list) would tag
        # every single entry; drop such keys before matching.
        keys = [key for key in keyword if key]
        if not keys:
            continue
        keyname = keys[0]
        if any(key in body or key in title for key in keys):
            tags.append(keyname)

    if 'tags' in entry:
        current = entry['tags']
        for tag in tags:
            # Another filter may already have set this tag.
            if tag not in current:
                current.append(tag)
    else:
        entry['tags'] = tags

    return entry
26 | + |
@@ -0,0 +1,22 @@ | ||
1 | +#!/usr/bin/python | |
2 | + | |
3 | +import sys | |
4 | +import re | |
5 | + | |
# Text emitted before and after the generated tuples.
HEADER = """#
# -*- coding: utf-8 -*-

keywords = ["""

FOOTER = """]"""


def _quote(term):
    """Return *term* as a ``u'...'`` literal with backslashes and quotes escaped."""
    return "u'" + term.replace("\\", "\\\\").replace("'", "\\'") + "'"


def format_keyword_line(line):
    """Convert one line of the keyword list into a tuple literal.

    A line holds comma-separated terms.  Surrounding whitespace is stripped
    from each term and empty terms are dropped.

    Args:
        line: one line of input, with or without its trailing newline.

    Returns:
        The text `` (u'a',u'b',),`` for the line's terms, or ``None`` when
        the line contains no term at all (a blank line would otherwise
        yield an empty keyword, and the empty string is contained in every
        text).
    """
    terms = [term.strip() for term in line.split(',')]
    terms = [term for term in terms if term]
    if not terms:
        return None
    return " (" + ",".join(_quote(term) for term in terms) + ",),"


def generate(lines):
    """Return the output lines of the generated module for *lines*.

    Args:
        lines: iterable of input lines.

    Returns:
        A list starting with ``HEADER``, followed by one tuple literal per
        non-blank input line, and ending with ``FOOTER``.
    """
    output = [HEADER]
    for line in lines:
        formatted = format_keyword_line(line)
        if formatted is not None:
            output.append(formatted)
    output.append(FOOTER)
    return output


def main():
    """Read the keyword list from stdin and print the module to stdout."""
    for text in generate(sys.stdin):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(text)


if __name__ == '__main__':
    main()
22 | + |
@@ -0,0 +1,152 @@ | ||
1 | +AMD | |
2 | +Amiga | |
3 | +Android | |
4 | +apache | |
5 | +BeOS | |
6 | +Blackberry | |
7 | +BSD | |
8 | +Caldera | |
9 | +Chrome | |
10 | +Chromium | |
11 | +Comdex | |
12 | +Compaq | |
13 | +Debian | |
14 | +Digital | |
15 | +DRM | |
16 | +EFF | |
17 | +enlightenment | |
18 | +EU | |
19 | ||
20 | +Firefox | |
21 | +GNOME | |
22 | +GNU | |
23 | ||
24 | +GUI | |
25 | +HP | |
26 | +IBM | |
27 | +idle | |
28 | +Intel | |
29 | +iOS | |
30 | +iPhone | |
31 | +IT | |
32 | +Java | |
33 | +JAXA | |
34 | +KDE | |
35 | +Linux | |
36 | +Mandrake | |
37 | +Linuxcare | |
38 | +Mac OS X,MacOS X | |
39 | +Mozilla | |
40 | +Namazu | |
41 | +NASA | |
42 | +Novell | |
43 | +NTT | |
44 | +Opera | |
45 | +Oracle | |
46 | +OS | |
47 | +Perl | |
48 | +PHP | |
49 | +Python | |
50 | +Quake | |
51 | +Ruby | |
52 | +Safari | |
53 | +SGI | |
54 | +SNS | |
55 | +Sony,ソニー | |
56 | +spam | |
57 | +SuSE | |
58 | +Gimp | |
59 | +Transmeta | |
60 | +TRON | |
61 | ||
62 | +Ubuntu | |
63 | +UNIX | |
64 | +Wikipedia,ウィキペディア | |
65 | +Windows | |
66 | +Windows Azure | |
67 | +Wine | |
68 | +Ximian | |
69 | +Yahoo,ヤフー | |
70 | +YouTube | |
71 | +Apple,アップル | |
72 | +インターネット | |
73 | +Internet Explorer,インターネットエクスプローラ | |
74 | +Open Source,OpenSource,オープンソース | |
75 | +ガンダム | |
76 | +Cloud,クラウド | |
77 | +game,ゲーム | |
78 | +Corel,コーレル | |
79 | +Star Wars,スターウオーズ | |
80 | +Startrek,スタートレック | |
81 | +Storage,ストレージ | |
82 | +スパコン | |
83 | +Slashdot,スラッシュドット | |
84 | +セキュリティ | |
85 | +ソフトウェア | |
86 | +ターボリナックス | |
87 | +TV,テレビ | |
88 | +データベース | |
89 | +Netscape,ネットスケープ | |
90 | +ネットワーク | |
91 | +ノートPC,ノートパソコン | |
92 | +ハンドヘルド | |
93 | +ハードウェア | |
94 | +ハードウェアハック | |
95 | +バイオテック | |
96 | +バグ | |
97 | +特許,パテント | |
98 | +ビジネス | |
99 | +ビール | |
100 | +プライバシ | |
101 | +プリンタ | |
102 | +プログラミング | |
103 | +ボットネット | |
104 | +Microsoft,マイクロソフト | |
105 | +メディア | |
106 | +モニター | |
107 | +モバイル | |
108 | +リンク | |
109 | +Red Hat,レッドハット | |
110 | +ロボット | |
111 | +ワーム | |
112 | +中国 | |
113 | +交通 | |
114 | +AI,人工知能 | |
115 | +仮想化 | |
116 | +任天堂 | |
117 | +入力デバイス | |
118 | +医療 | |
119 | +原子力 | |
120 | +ISS,国際宇宙ステーション | |
121 | +地球 | |
122 | +地震 | |
123 | +娯楽 | |
124 | +宇宙 | |
125 | +広告 | |
126 | +情報漏洩 | |
127 | +携帯通信 | |
128 | +携帯電話 | |
129 | +政府 | |
130 | +政治 | |
131 | +教育 | |
132 | +数学 | |
133 | +日本 | |
134 | +日記 | |
135 | +映画 | |
136 | +暗号 | |
137 | +書籍 | |
138 | +検閲 | |
139 | +法廷 | |
140 | +海賊行為 | |
141 | +火星 | |
142 | +犯罪 | |
143 | +統計 | |
144 | +英国 | |
145 | +著作権 | |
146 | +軍事 | |
147 | +通信 | |
148 | +電力 | |
149 | +音楽 | |
150 | +スマートフォン,スマホ,スマートホン | |
151 | +グラフィックカード,グラフィックスカード,GPU | |
152 | +NTTドコモ,ドコモ |
@@ -58,17 +58,24 @@ | ||
58 | 58 | <div class="entry-continue"> |
59 | 59 | <a href='${entry.link}'>[続きを読む]</a> |
60 | 60 | </div> |
61 | - <div class="entry-footer"> | |
61 | + <div class="information"> | |
62 | 62 | <span>情報元:<a href='${entry.feed.source_url}'>${entry.feed.name}</a></span> |
63 | 63 | <span>(${date_format(entry.date)})</span> |
64 | 64 | <span>タグ:</span> |
## One <span> per tag attached to the entry.
% for tag in entry.tags:
<span>${tag} </span>
% endfor
## Image links are listed once per entry, after the tags. Keeping this
## block outside the tag loop stops it repeating for every tag and lets it
## show for entries that have no tags.
% if 'images' in entry:
<span>画像:</span>
% for imgurl in entry.images:
<span><a href="${imgurl}">*</a></span>
% endfor
% endif
68 | 74 | </div> |
69 | 75 | </div> |
70 | 76 | </div> |
71 | 77 | % endfor |
78 | + | |
72 | 79 | <div class="pagination pagination-centered"> |
73 | 80 | <ul> |
74 | 81 | % for page in range(1, params.page.total + 1): |