正则表达式进行提取,然后通过 Python 的自然语言处理工具 nltk 进行单词原形的转换(词形还原)
由于使用了传说中的 requests,使得代码简洁程度大大提升。
英文单词处理模块
shanbay 登录以及单词的提交
这是 v0.1 版本,有不少细节需要调整。有时间我会继续更新
代码的逻辑比较简单,自己阅读代码吧
import os
import re
# Author metadata for this module.
__author__ = 'micheal'
from nltk.stem import WordNetLemmatizer
# Module-level WordNet lemmatizer; get_sorted_words_list_from() calls
# lemmatizer.lemmatize() on every token it extracts.
lemmatizer = WordNetLemmatizer()
# Constants: folder and file names, all relative to the current working directory.
BOOKS_FOLDER = "Books/"  # input folder; every entry in it is read as a text file
OUTPUTS_FOLDER = "OutPuts/"  # output folder; one word-list file is written per input file
COVER_FILE = "cover.jpg"  # file name of the cover image
EXCLUDED_LIB_FILE = "excluded_libs.txt"  # file name of the excluded-words text
SUMMARY_FILE = "SUMMARY.json"  # file name of the JSON summary
ALL_LIB_FILE = "all.txt"  # merged file produced by createMergeFile()
### Initialize data
# Build the merged file (ALL_LIB_FILE) from every file in BOOKS_FOLDER
def createMergeFile():
    """Concatenate every file in BOOKS_FOLDER into ALL_LIB_FILE.

    ALL_LIB_FILE is truncated first, then the content of each entry returned
    by ``os.listdir(BOOKS_FOLDER)`` is appended in listing order. Both sides
    are opened in text mode with the platform default encoding.

    Returns:
        None.
    """
    # Context managers guarantee that both handles are closed even when a
    # read or write raises part-way through the merge.
    with open(ALL_LIB_FILE, 'w') as merged:
        for name in os.listdir(BOOKS_FOLDER):
            with open(BOOKS_FOLDER + name, "r") as book:
                # Copy in 16 KiB chunks so a large book is never loaded whole.
                while True:
                    chunk = book.read(16 * 1024)
                    if not chunk:
                        break
                    merged.write(chunk)
def get_sorted_words_list_from(txt_path):
    """Return the sorted, de-duplicated lemmas found in a text file.

    The file is read in text mode, lower-cased and split into ASCII word
    tokens (``\\w+`` with ``re.ASCII``). Each distinct token is passed through
    the module-level ``lemmatizer``. A lemma is kept only when it contains no
    digit, contains no non-word character and is longer than four characters.

    Args:
        txt_path: Path of the text file to read.

    Returns:
        A list of the surviving lemmas in ascending order, without duplicates.
    """
    with open(txt_path, "r") as f:
        text = f.read()
    # Raw strings keep the regex escapes from being parsed as (invalid)
    # string escapes. De-duplicating the tokens first avoids lemmatizing the
    # same word repeatedly; the resulting set of lemmas is the same.
    tokens = set(re.findall(r"\w+", text.lower(), flags=re.ASCII))
    lemmas = {lemmatizer.lemmatize(token) for token in tokens}
    # Keep lemmas that contain no digit and no non-(ASCII letter/digit/underscore)
    # character and that are longer than four characters.
    return sorted(
        lemma
        for lemma in lemmas
        if len(lemma) > 4
        and not re.search(r"\d+", lemma)
        and not re.search(r"\W+", lemma, flags=re.ASCII)
    )
def WordCountInit():
    """Build the per-book word lists under OUTPUTS_FOLDER.

    Steps:
      1. Merge all books into ALL_LIB_FILE via ``createMergeFile()``.
      2. Load the initial excluded words from EXCLUDED_LIB_FILE.
      3. For every entry in BOOKS_FOLDER (in ``os.listdir`` order), extract
         its words, drop the ones already excluded, print the remainder and
         write it to ``OUTPUTS_FOLDER + <entry name>`` as the ``str()`` of a
         sorted list. All words of that book are then added to the excluded
         set, so a later book only lists words not seen in an earlier one.

    Returns:
        True, always.
    """
    createMergeFile()
    # A set makes each membership test O(1); the original list grew with
    # every book and made the filtering quadratic.
    excluded_words = set(get_sorted_words_list_from(EXCLUDED_LIB_FILE))
    # Make sure the output folder exists so the writes below cannot fail on
    # a fresh checkout.
    os.makedirs(OUTPUTS_FOLDER, exist_ok=True)
    for item in os.listdir(BOOKS_FOLDER):
        words = get_sorted_words_list_from(BOOKS_FOLDER + item)
        real_words = [word for word in words if word not in excluded_words]
        print("real_words\n" + str(real_words))
        excluded_words.update(words)
        with open(OUTPUTS_FOLDER + item, "w") as f:
            f.write(str(sorted(real_words)))
    return True
# print(file)
from datetime import datetime
import json
import os
import requests
from WordsCount import get_sorted_words_list_from, WordCountInit
# Author metadata for this script.
__author__ = 'micheal'
# URL of a word book page on shanbay.com.
word_book_url = "http://www.shanbay.com/wordbook/99004/"
from nltk.stem import WordNetLemmatizer
# Module-level WordNet lemmatizer instance.
lemmatizer = WordNetLemmatizer()
class ShanBay:
    """Log in to shanbay.com and upload local word lists as a word book.

    Constructing an instance runs the whole workflow:
      1. log in,
      2. create a word book,
      3. create one word list per entry in ``Books/``,
      4. fill each word list with the words read from ``OutPuts/``.
    """

    def __init__(self, username="test", password="test"):
        """Prepare the local data, log in and upload everything.

        Args:
            username: Account name posted to the login form. Defaults to the
                placeholder ``"test"``.
            password: Password posted to the login form. Defaults to the
                placeholder ``"test"``.

        Raises:
            KeyError: If the session has no ``csrftoken`` cookie, or if
                ``SUMMARY.json`` lacks one of the keys ``title``,
                ``category``, ``description`` or ``price``.
        """
        print("基本数据正在初始化")
        WordCountInit()
        print("初始化完毕")
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
        }
        self.index_url = "http://www.shanbay.com/"
        self.login_url = "http://www.shanbay.com/accounts/login/"
        self.sb_session = requests.Session()
        self.sb_session.headers.update(self.headers)
        # Fetch the login page first; doLogin() reads the "csrftoken" cookie
        # from the session, presumably set by this response.
        self.sb_session.get(self.login_url)

        # Log in.
        print("正在登录中")
        self.doLogin(username, password)

        print("单词书 情况如下 \n")
        with open("SUMMARY.json", "r") as f:
            decodejson = json.loads(f.read())
        self.book_info_title = decodejson["title"]
        self.book_info_category = decodejson["category"]
        self.book_info_description = decodejson["description"]
        self.book_info_price = decodejson["price"]
        # str() keeps the summary printable when a JSON value (e.g. the
        # price) is a number rather than a string.
        print(
            "标题:" + str(self.book_info_title)
            + "\n 类型:" + str(self.book_info_category)
            + "\n 描述:" + str(self.book_info_description)
            + "\n 价格:" + str(self.book_info_price))
        self.createWordBook()

        # Create and fill one word list per entry in Books/.
        for word_list in os.listdir("Books/"):
            print("正在创建" + word_list)
            stamp = datetime.now().strftime('%Y%m%d%H%M%S')
            list_id = self.createWordList(
                self.word_book_id,
                "List" + stamp,
                "这仅仅是一个比较简单的描述" + stamp,
            )
            self.fillWordListById(list_id, get_sorted_words_list_from("OutPuts/" + word_list))

    def doLogin(self, username, password):
        """POST the login form to ``self.login_url``.

        The form is also stored on ``self.login_form``. The response is not
        inspected, so a return value of True does not prove the login worked.

        Args:
            username: Account name.
            password: Account password.

        Returns:
            True, always.

        Raises:
            KeyError: If the session has no ``csrftoken`` cookie.
        """
        self.login_form = {
            "csrfmiddlewaretoken": self.sb_session.cookies["csrftoken"],
            "username": username,
            "password": password,
        }
        self.sb_session.post(self.login_url, self.login_form)
        return True

    def fillWordListById(self, _id, words):
        """POST every word in ``words`` to the word list ``_id``.

        One request is sent per word; the responses are not inspected.

        Args:
            _id: Identifier of the target word list (e.g. ``"172981"``).
            words: Iterable of words to add.

        Returns:
            0, always.
        """
        for word in words:
            post_data = {
                "id": _id,
                "word": word,
            }
            self.sb_session.post("http://www.shanbay.com/api/v1/wordlist/vocabulary/", post_data)
        return 0

    def createWordBook(self):
        """Create the word book and upload its cover image.

        Posts the book's basic info, stores the new book id on
        ``self.word_book_id`` and then uploads ``cover.jpg``.

        Returns:
            True, always.

        Raises:
            KeyError: If the session has no ``csrftoken`` cookie.
            OSError: If ``cover.jpg`` cannot be opened.
        """
        csrf_token = self.sb_session.cookies["csrftoken"]
        createWordBookForm = {
            "csrfmiddlewaretoken": csrf_token,
            "title": self.book_info_title,
            "category": self.book_info_category,
            "description": self.book_info_description,
            "price": self.book_info_price,
        }
        t = self.sb_session.post("http://www.shanbay.com/wordbook/create/basicinfo/", createWordBookForm)
        # The id is taken from the third-from-last "/"-separated segment of
        # the final response URL.
        # NOTE(review): presumably the server redirects to
        # .../wordbook/create/<id>/uploadcover/ - confirm.
        self.word_book_id = str(t.url).split("/")[-3]

        # Plain form fields go in ``data``; only the image goes in ``files``.
        # Passing the fields through ``files`` (as before) sends each of them
        # as a file part with a filename instead of an ordinary form field.
        cover_fields = {
            "csrfmiddlewaretoken": csrf_token,
            "wordbook_id": self.word_book_id,
            "description": self.book_info_description,
        }
        upload_url = "http://www.shanbay.com/wordbook/create/" + self.word_book_id + "/uploadcover/"
        # ``with`` closes the image handle once the upload has been sent.
        with open('cover.jpg', 'rb') as cover:
            self.sb_session.post(
                upload_url,
                data=cover_fields,
                files={"cover": ('cover.jpg', cover, 'image/jpeg', {'Expires': '0'})},
                # The session default Content-Type (urlencoded) would stop
                # requests from setting the multipart header with its
                # boundary; None removes the default for this request only.
                headers={"Content-Type": None},
            )
        return True

    def doMenu(self):
        """Placeholder for an interactive menu; currently does nothing.

        TODO: offer createWordBook / createWordList choices and let the user
        enter a ``word_book_id``.
        """
        pass

    def createWordList(self, _id, name, desc):
        """Create a word list inside the word book ``_id``.

        Args:
            _id: Identifier of the word book that will own the list.
            name: Name of the new word list.
            desc: Description of the new word list.

        Returns:
            The value at ``["data"]["wordlist"]["id"]`` of the JSON response.

        Raises:
            ValueError: If the response body is not valid JSON.
            KeyError: If the response lacks the expected keys.
        """
        createWordListFrom = {
            "name": name,
            "description": desc,
            "wordbook_id": _id,
        }
        r = self.sb_session.post("http://www.shanbay.com/api/v1/wordbook/wordlist/", createWordListFrom)
        return json.loads(r.text)["data"]["wordlist"]["id"]
# Script entry point: constructing ShanBay runs the whole login-and-upload
# workflow from its __init__.
sss = ShanBay()
这个程序还是比较简单的。
ChangeLog: