# 背单词小助手

# 0x01 本文内容

# 思路

  • 英文单词的处理
  • shanbay 单词本的提交

# 英文单词的处理

使用正则表达式进行提取,然后通过 Python 的自然语言处理工具 nltk 进行单词原形的转换(词形还原)。

# shanbay 单词的处理

由于使用了传说中的 requests,使得代码简洁程度大大提升。

# 代码如下

英文单词处理模块

shanbay 登录以及单词的提交

这是 v0.1 版本,有不少细节需要调整,有时间我会继续更新。
代码逻辑比较简单,自己阅读代码吧。

请来 github 这里

import os
import re

__author__ = 'micheal'

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

## Constant settings

# Directory whose files are the input texts (one word list per file).
BOOKS_FOLDER = "Books/"

# Directory the filtered word list of each input file is written to.
OUTPUTS_FOLDER = "OutPuts/"

# Cover image file name (presumably the wordbook cover uploaded to
# Shanbay -- TODO confirm).
COVER_FILE = "cover.jpg"

# Text file with the words to leave out of the generated lists
# (presumably the file WordCountInit reads -- TODO confirm).
EXCLUDED_LIB_FILE = "excluded_libs.txt"

# JSON file with wordbook metadata (presumably title / category /
# description / price -- TODO confirm).
SUMMARY_FILE = "SUMMARY.json"

# Output file that receives the concatenation of all input texts.
ALL_LIB_FILE = "all.txt"

### Data initialisation
# Build the merged "all" file.
def createMergeFile():
    """Concatenate every file in ``BOOKS_FOLDER`` into ``ALL_LIB_FILE``.

    The merged file is truncated first and then receives the contents of
    each entry returned by ``os.listdir(BOOKS_FOLDER)``, in listing order.
    Files are opened in text mode with the platform default encoding.

    Returns:
        None.

    Raises:
        OSError: if the folder cannot be listed or a file cannot be
            opened, read or written.
    """
    # Local import: keeps this function self-contained without touching the
    # module's import block.
    import shutil

    # Context managers guarantee that both handles are closed even when a
    # read or write fails half-way through.
    with open(ALL_LIB_FILE, 'w') as merged:
        for name in os.listdir(BOOKS_FOLDER):
            with open(BOOKS_FOLDER + name, "r") as book:
                # Copy in 16 KiB chunks so large books are never loaded
                # into memory at once.
                shutil.copyfileobj(book, merged, 16 * 1024)

def get_sorted_words_list_from(txt_path):
    """Return the sorted, de-duplicated lemmas found in a text file.

    The text is lower-cased and split into ASCII ``\\w+`` tokens, each
    distinct token is lemmatized with the module-level ``lemmatizer``, and
    only lemmas that are longer than four characters and contain neither a
    digit nor a non-word character are kept.

    Args:
        txt_path: path of the text file to read (opened in text mode with
            the platform default encoding).

    Returns:
        A list of unique lemmas in ascending order.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    with open(txt_path, "r") as f:
        text = f.read()

    # Raw strings keep the regex escapes valid; "\w" in a plain literal is an
    # invalid escape sequence that newer Python versions warn about.
    tokens = set(re.findall(r"\w+", text.lower(), flags=re.ASCII))

    # Lemmatize each distinct token once instead of every occurrence; the
    # resulting set of lemmas is the same.
    lemmas = {lemmatizer.lemmatize(token) for token in tokens}

    # Keep only lemmas longer than 4 characters that contain no digit and no
    # character outside the ASCII word class.
    return sorted(
        word
        for word in lemmas
        if len(word) > 4
        and not re.search(r"\d", word)
        and not re.search(r"\W", word, flags=re.ASCII)
    )

def WordCountInit():
    """Generate one filtered word list per book in ``OUTPUTS_FOLDER``.

    Steps:

    1. Rebuild the merged file via ``createMergeFile()``.
    2. Load the excluded words from ``EXCLUDED_LIB_FILE``.
    3. For every file in ``BOOKS_FOLDER`` (processed in sorted name order
       so the result is reproducible), extract its words, drop the ones
       that are excluded or already appeared in an earlier book, print the
       remaining words and write them, as the ``str()`` of a sorted list,
       to a file of the same name in ``OUTPUTS_FOLDER``.

    Returns:
        True once every book has been processed.

    Raises:
        OSError: if an input file cannot be read or an output file cannot
            be written.
    """
    createMergeFile()

    # A set gives O(1) membership tests; the exclusion collection grows with
    # every processed book.
    excluded_words = set(get_sorted_words_list_from(EXCLUDED_LIB_FILE))

    # Make sure the output folder exists before the first write.
    os.makedirs(OUTPUTS_FOLDER, exist_ok=True)

    for item in sorted(os.listdir(BOOKS_FOLDER)):
        words = get_sorted_words_list_from(BOOKS_FOLDER + item)

        # `words` is already sorted and unique, so the filtered list is too.
        real_words = [word for word in words if word not in excluded_words]

        print("real_words\n" + str(real_words))

        # Words of this book must not show up again in later books.
        excluded_words.update(words)
        with open(OUTPUTS_FOLDER + item, "w") as f:
            f.write(str(real_words))

    return True

from datetime import datetime
import json
import os
import requests
from WordsCount import get_sorted_words_list_from, WordCountInit

__author__ = 'micheal'

word_book_url = "http://www.shanbay.com/wordbook/99004/"

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

class ShanBay:
    """Upload locally generated word lists to a Shanbay wordbook.

    Constructing an instance runs the whole workflow:

    1. log in,
    2. create a wordbook from the metadata in ``SUMMARY.json``,
    3. create one word list per file found in ``Books/``,
    4. fill each word list with the words read from ``OutPuts/``.
    """

    def __init__(self, username="test", password="test"):
        """Run the complete upload workflow.

        Args:
            username: Shanbay account name used for the login form.
            password: Shanbay account password used for the login form.

        Raises:
            KeyError: if the login page did not set a ``csrftoken`` cookie
                or ``SUMMARY.json`` lacks one of the expected keys.
            OSError: if ``SUMMARY.json`` or ``cover.jpg`` cannot be read.
        """
        print("基本数据正在初始化")
        WordCountInit()
        print("初始化完毕")

        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
        }

        self.index_url = "http://www.shanbay.com/"

        # NOTE(review): the login form is posted over plain HTTP; consider
        # switching to HTTPS if the site supports it.
        self.login_url = "http://www.shanbay.com/accounts/login/"

        self.sb_session = requests.Session()
        self.sb_session.headers.update(self.headers)
        # Fetch the login page first so the session holds the csrftoken
        # cookie that the forms below read.
        self.sb_session.get(self.login_url)

        print("正在登录中")
        self.doLogin(username, password)

        print("单词书  情况如下 \n")
        with open("SUMMARY.json", "r") as f:
            decodejson = json.loads(f.read())
        self.book_info_title = decodejson["title"]
        self.book_info_category = decodejson["category"]
        self.book_info_description = decodejson["description"]
        self.book_info_price = decodejson["price"]
        # str() keeps the summary printable when a JSON value (e.g. the
        # price) is a number rather than a string.
        print(
            "标题:" + str(self.book_info_title)
            + "\n 类型:" + str(self.book_info_category)
            + "\n 描述:" + str(self.book_info_description)
            + "\n 价格:" + str(self.book_info_price))

        self.createWordBook()

        # Create and fill one word list for every file found in Books/.
        word_list_list = os.listdir("Books/")
        for word_list in word_list_list:
            print("正在创建" + word_list)

            # The current timestamp makes the list name and description
            # distinct per list.
            stamp = datetime.now().strftime('%Y%m%d%H%M%S')

            list_id = self.createWordList(self.word_book_id, "List" + stamp,
                                          "这仅仅是一个比较简单的描述" + stamp)

            self.fillWordListById(list_id, get_sorted_words_list_from("OutPuts/" + word_list))

    def doLogin(self, username, password):
        """Post the login form using the session's CSRF token.

        The response is not inspected, so a rejected login is not detected.

        Args:
            username: account name.
            password: account password.

        Returns:
            True, unconditionally.

        Raises:
            KeyError: if the session has no ``csrftoken`` cookie.
        """
        self.login_form = {
            "csrfmiddlewaretoken": self.sb_session.cookies["csrftoken"],
            "username": username,
            "password": password,
        }

        self.sb_session.post(self.login_url, self.login_form)

        return True

    def fillWordListById(self, _id, words):
        """Add every word in ``words`` to the word list ``_id``.

        One POST request is sent per word; responses are not inspected.

        Args:
            _id: id of the target word list.
            words: iterable of words to add.

        Returns:
            0, unconditionally.
        """
        for word in words:
            post_data = {
                "id": _id,
                "word": word,
            }
            self.sb_session.post("http://www.shanbay.com/api/v1/wordlist/vocabulary/", post_data)

        return 0

    def createWordBook(self):
        """Create the wordbook and try to upload its cover image.

        Posts the book metadata, stores the new book id (third-from-last
        path segment of the response URL) in ``self.word_book_id`` and then
        posts ``cover.jpg`` as the cover.

        Returns:
            True, unconditionally.

        Raises:
            KeyError: if the session has no ``csrftoken`` cookie.
            OSError: if ``cover.jpg`` cannot be opened.
        """
        createWordBookForm = {
            "csrfmiddlewaretoken": self.sb_session.cookies["csrftoken"],
            "title": self.book_info_title,
            "category": self.book_info_category,
            "description": self.book_info_description,
            "price": self.book_info_price,
        }

        t = self.sb_session.post("http://www.shanbay.com/wordbook/create/basicinfo/", createWordBookForm)
        self.word_book_id = str(t.url).split("/")[-3]

        # NOTE: per the original author, the cover upload does not work yet
        # and is to be fixed later.
        # The context manager closes the image handle once the request has
        # been sent instead of leaking it.
        with open('cover.jpg', 'rb') as cover:
            coverFiles = {
                "csrfmiddlewaretoken": self.sb_session.cookies["csrftoken"],
                "cover": ('cover.jpg', cover, 'image/jpeg', {'Expires': '0'}),
                "wordbook_id": self.word_book_id,
                "description": self.book_info_description,
            }

            self.sb_session.post("http://www.shanbay.com/wordbook/create/" + self.word_book_id + "/uploadcover/",
                                 files=coverFiles)

        return True

    def doMenu(self):
        """Placeholder for an interactive menu; currently does nothing."""
        pass

    def createWordList(self, _id, name, desc):
        """Create a word list inside wordbook ``_id``.

        Args:
            _id: id of the wordbook that receives the list.
            name: name of the new word list.
            desc: description of the new word list.

        Returns:
            The id of the created word list, read from
            ``data.wordlist.id`` of the JSON response.

        Raises:
            ValueError: if the response body is not valid JSON.
            KeyError: if the response JSON lacks the expected keys.
        """
        createWordListFrom = {
            "name": name,
            "description": desc,
            "wordbook_id": _id,
        }
        r = self.sb_session.post("http://www.shanbay.com/api/v1/wordbook/wordlist/", createWordListFrom)
        return json.loads(r.text)["data"]["wordlist"]["id"]

# Script entry: constructing ShanBay immediately runs the whole workflow
# (word extraction, login, wordbook creation and word list upload).
sss = ShanBay()

这个程序还是比较简单的。


ChangeLog:

  • 2020-11-30 重修文字