A Shanbay Vocabulary Assistant

Contents

Approach

  • Processing the English words
  • Submitting them to a shanbay wordbook

Processing the English words

The words are extracted with a regular expression, then reduced to their base forms with nltk, Python's natural language toolkit.
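
A minimal sketch of this step (the sample text and variable names are just for illustration; the WordNet data needs to be downloaded once with nltk.download('wordnet')):

import re
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

text = "The cats were running across the fields"        # hypothetical input text
tokens = re.findall(r"[a-z]+", text.lower())             # extract words with a regular expression
lemmas = sorted({lemmatizer.lemmatize(w) for w in tokens})
print(lemmas)  # with the default (noun) POS: ['across', 'cat', 'field', 'running', 'the', 'were']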

Submitting the words to shanbay

Thanks to the legendary requests library, the code ends up much more concise.
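
As a rough sketch of why it is so short (the URLs and form fields are the same ones used in the code further down; the credentials are placeholders): a Session keeps cookies between requests, so logging in is just a GET to pick up the csrftoken cookie followed by a POST of the form.

import requests

login_url = "http://www.shanbay.com/accounts/login/"
session = requests.Session()
session.get(login_url)  # the first GET sets the csrftoken cookie
login_form = {
    "csrfmiddlewaretoken": session.cookies["csrftoken"],
    "username": "your_username",   # placeholder credentials
    "password": "your_password",
}
session.post(login_url, login_form)  # the session keeps the login cookies afterwards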

The code consists of two parts:

  • the English word processing module
  • shanbay login and word submission

This is v0.1, and quite a few details still need polishing; I will keep updating it when I have time.
The logic is fairly simple, so just read the code yourself.

The full project is over here on GitHub.

import os
import re

__author__ = 'micheal'

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

## Constants
BOOKS_FOLDER = "Books/"
OUTPUTS_FOLDER = "OutPuts/"
COVER_FILE = "cover.jpg"
EXCLUDED_LIB_FILE = "excluded_libs.txt"
SUMMARY_FILE = "SUMMARY.json"
ALL_LIB_FILE = "all.txt"


### Initialize the data:
# merge every book into a single file (ALL_LIB_FILE)
def createMergeFile():
    fo = open(ALL_LIB_FILE, 'w')
    for name in os.listdir(BOOKS_FOLDER):
        fi = open(BOOKS_FOLDER + name, "r")
        while True:
            s = fi.read(16 * 1024)
            if not s:
                break
            fo.write(s)
        fi.close()
    fo.close()


def get_sorted_words_list_from(txt_path):
    with open(txt_path, "r") as f:
        strs = f.read()
    s = re.findall(r"\w+", strs.lower(), flags=re.ASCII)
    ss = []
    for item in s:
        ss.append(lemmatizer.lemmatize(item))
    l = sorted(list(set(ss)))
    ll = []
    for i in l:
        m = re.search(r"\d+", i)
        n = re.search(r"\W+", i, flags=re.ASCII)
        if not m and not n and len(i) > 4:
            ll.append(i)
    # keep only words with no digits, no non-alphanumeric characters,
    # and more than 4 letters
    return ll


def WordCountInit():
    createMergeFile()
    excluded_words = get_sorted_words_list_from(EXCLUDED_LIB_FILE)
    file = {}
    folder_list = os.listdir(BOOKS_FOLDER)
    for item in folder_list:
        file[item] = get_sorted_words_list_from(BOOKS_FOLDER + item)
        words = file[item]
        real_words = []
        for word in words:
            if word not in excluded_words:
                real_words.append(word)
        # print("excluded_words\n" + str(excluded_words))
        print("real_words\n" + str(real_words))
        excluded_words.extend(file[item])
        with open(OUTPUTS_FOLDER + item, "w") as f:
            f.write(str(sorted(list(real_words))))
    return True
    # print(file)
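
A quick way to try this module on its own might look like the following (it assumes the Books/ and OutPuts/ folders and excluded_libs.txt already exist next to the script, and that the file is saved as WordsCount.py, which is the name the import in the next module uses):

# hypothetical driver script for the module above
from WordsCount import WordCountInit, get_sorted_words_list_from

WordCountInit()                                     # merge the books and write one word list per book
print(get_sorted_words_list_from("all.txt")[:20])   # peek at the first 20 lemmatized words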
from datetime import datetime
import json
import os

import requests

from WordsCount import get_sorted_words_list_from, WordCountInit

__author__ = 'micheal'

word_book_url = "http://www.shanbay.com/wordbook/99004/"

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


class ShanBay:
    # Initialize the basic shanbay state:
    # 1. log in
    # 2. create the wordbook
    # 3. create the wordlists
    # 4. fill each wordlist from the word sets
    # 5. filling strategy
    #
    def __init__(self):
        print("Initializing the base data")
        WordCountInit()
        print("Initialization finished")
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
        }
        self.index_url = "http://www.shanbay.com/"
        self.login_url = "http://www.shanbay.com/accounts/login/"
        self.sb_session = requests.Session()
        self.sb_session.headers.update(self.headers)
        self.sb_session.get(self.login_url)
        # Log in
        print("Logging in")
        self.doLogin("test", "test")
        print("Wordbook info:\n")
        with open("SUMMARY.json", "r") as f:
            decodejson = json.loads(f.read())
        self.book_info_title = decodejson["title"]
        self.book_info_category = decodejson["category"]
        self.book_info_description = decodejson["description"]
        self.book_info_price = decodejson["price"]
        print("Title: " + self.book_info_title + "\nCategory: " + self.book_info_category +
              "\nDescription: " + self.book_info_description + "\nPrice: " + self.book_info_price)
        self.createWordBook()
        ## For each book, create a wordlist and fill it
        word_list_list = os.listdir("Books/")
        for word_list in word_list_list:
            print("Creating " + word_list)
            dt = datetime.now()
            list_id = self.createWordList(self.word_book_id, "List" + dt.strftime('%Y%m%d%H%M%S'),
                                          "Just a simple description " + dt.strftime('%Y%m%d%H%M%S'))
            self.fillWordListById(list_id, get_sorted_words_list_from("OutPuts/" + word_list))
        # print(self.sb_session.get(self.index_url).text)

    def doLogin(self, username, password):
        self.login_form = {
            "csrfmiddlewaretoken": self.sb_session.cookies["csrftoken"],
            "username": username,
            "password": password,
        }
        self.sb_session.post(self.login_url, self.login_form)
        return True

    def fillWordListById(self, _id, words):
        # "172981"
        for word in words:
            post_data = {
                "id": _id,
                "word": word,
            }
            self.sb_session.post("http://www.shanbay.com/api/v1/wordlist/vocabulary/", post_data)
        return 0

    def createWordBook(self):
        # http://www.shanbay.com/wordbook/create/basicinfo/
        createWordBookForm = {
            "csrfmiddlewaretoken": self.sb_session.cookies["csrftoken"],
            "title": self.book_info_title,
            "category": self.book_info_category,
            "description": self.book_info_description,
            "price": self.book_info_price,
        }
        t = self.sb_session.post("http://www.shanbay.com/wordbook/create/basicinfo/", createWordBookForm)
        self.word_book_id = str(t.url).split("/")[-3]
        # Uploading the cover does not work yet... to be fixed later
        coverFiles = {
            "csrfmiddlewaretoken": self.sb_session.cookies["csrftoken"],
            "cover": ('cover.jpg', open('cover.jpg', 'rb'), 'image/jpeg', {'Expires': '0'}),
            "wordbook_id": self.word_book_id,
            "description": self.book_info_description,
        }
        # files = {'file': ('report.xls', open('report.xls', 'rb'), 'application/vnd.ms-excel', {'Expires': '0'})}
        self.sb_session.post("http://www.shanbay.com/wordbook/create/" + self.word_book_id + "/uploadcover/",
                             files=coverFiles)
        return True

    def doMenu(self):
        # while True:
        #     print("""\
        #
        #     Choose an action:
        #     1. createWordBook
        #     2. createWordList
        #
        #     """)
        #     self.word_book_id = input("word_book_id")
        pass

    def createWordList(self, _id, name, desc):
        createWordListFrom = {
            "name": name,
            "description": desc,
            "wordbook_id": _id,
        }
        r = self.sb_session.post("http://www.shanbay.com/api/v1/wordbook/wordlist/", createWordListFrom)
        return json.loads(r.text)["data"]["wordlist"]["id"]


sss = ShanBay()
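
Before running it, the script also expects a SUMMARY.json next to it describing the wordbook. The field names are the ones __init__ reads above; the values below are only placeholders, and they are kept as strings because the code concatenates them directly into the printed summary:

# hypothetical helper that writes a SUMMARY.json with placeholder values
import json

summary = {
    "title": "My Reading Vocabulary",
    "category": "placeholder category",
    "description": "Words collected from my e-books",
    "price": "0",
}
with open("SUMMARY.json", "w") as f:
    json.dump(summary, f, ensure_ascii=False)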

The program is still fairly simple.

Original content notice:
Unless marked as a compilation or a repost, every article on this blog is my own original work. Feel free to repost and share it for non-commercial purposes.
"About me" (http://twocucao.xyz/about/):
click the link to see my self-introduction as a set of web slides.

My GitHub: https://github.com/twocucao (there is not much there yet, but Stars and Forks are welcome, even early ones)
Jianshu: http://www.jianshu.com/users/9a7e0b9da317/latest_articles (rarely updated, and with almost no technical write-ups)
Contact: twocucao@gmail.com
I am still a fairly mediocre programmer with limited knowledge; if you spot any mistakes in this post, corrections are welcome, especially technical ones.