-
Notifications
You must be signed in to change notification settings - Fork 1
/
Sousouwa_fetch_kobo.py
129 lines (106 loc) · 3.66 KB
/
Sousouwa_fetch_kobo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import htmlentitydefs
import codecs
import urllib2
from BeautifulSoup import BeautifulSoup
import re
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
# 実体参照 & 文字参照を通常の文字に戻す
def htmlentity2unicode(text):
# 正規表現のコンパイル
reference_regex = re.compile(u'&(#x?[0-9a-f]+|[a-z]+);', re.IGNORECASE)
num16_regex = re.compile(u'#x\d+', re.IGNORECASE)
num10_regex = re.compile(u'#\d+', re.IGNORECASE)
result = u''
i = 0
while True:
# 実体参照 or 文字参照を見つける
match = reference_regex.search(text, i)
if match is None:
result += text[i:]
break
result += text[i:match.start()]
i = match.end()
name = match.group(1)
# 実体参照
if name in htmlentitydefs.name2codepoint.keys():
result += unichr(htmlentitydefs.name2codepoint[name])
# 文字参照
elif num16_regex.match(name):
# 16進数
result += unichr(int(u'0'+name[1:], 16))
elif num10_regex.match(name):
# 10進数
result += unichr(int(name[1:]))
return result
def removeSpaceEntity(s):
pattern = re.compile('( | | )')
return pattern.sub('', s)
def convertEntity(s):
pattern = re.compile('(&#([0-9a-fA-F]{4});)')
return pattern.sub(lambda x: unichr(int(x.group(2), 16)), s)
def convertSousouwaEntity(s):
pattern = re.compile(r''')
tmp = pattern.sub('\'', s)
pattern = re.compile(r''')
tmp = pattern.sub('\'', s)
return tmp
def brRemove(s):
brRemover = re.compile(r'<br.*?>')
return brRemover.sub('', s)
def divRemove(s):
divRemover = re.compile(r'(<div.*?>|</div>)')
return divRemover.sub('', s)
def tagRemove(s):
tagRemover = re.compile(r'(<.*?>|</.*?>)')
return tagRemover.sub('', s)
def spaceRemove(s):
spaceRemover = re.compile(r'^( | | |\n)+')
tmp = spaceRemover.sub('', s)
spaceRemover = re.compile(r'( | | |\n)+$')
return spaceRemover.sub('', tmp)
def unescape(s):
s = removeSpaceEntity(s)
s = convertEntity(s)
s = tagRemove(s)
return s
def titleEscape(s):
s = tagRemove(s)
s = spaceRemove(s)
s = convertEntity(s)
s = convertSousouwaEntity(s)
return s
def main():
argvs = sys.argv
url = sys.argv[1]
# url = 'http://coolier-new.sytes.net:8080/sosowa/ssw_l/?mode=read&key=1342594292&log=0'
html = urllib2.urlopen(url).read().decode('cp932')
soup = BeautifulSoup(html)
title = soup.find('title')
body = soup.find("div", attrs={"class": "contents ss"})
aft = soup.find("div", attrs={"class": "aft"})
# title = unescape(str(title))
body = unescape(str(body))
aft = unescape(str(aft))
title = titleEscape(str(title))
#書き出す先のフォルダがあるかどうか確認し、なければ作成
if os.path.exists("./sousouwa_ss") == False:
os.mkdir('./sousouwa_ss')
filename = './sousouwa_ss/' + title + '.html'
#koboはhtmlとして認識させないと文字化けするので、htmlスニペットを用意
htmlWrapBefore = '<?xml version="1.0" encoding="Shift_JIS"?><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"><html xmlns="http://www.w3.org/1999/xhtml" xml:lang="ja" ><body><pre>'
htmlWrapAfter = "</pre></body></html>"
#ファイルに書き出す
sjisWriter = codecs.open(filename, 'w', 'cp932')
sjisWriter.write(htmlWrapBefore)
sjisWriter.write(body) # 引数の文字列をファイルに書き込む
sjisWriter.write(htmlWrapAfter)
#処理完了をコンソールに伝える
print title
print 'is downloaded successfully.'
if __name__ == '__main__':
main()