Back to Blogs
python
english

Check English Word

Soloman
2019-07-08

Check English Word with Python

1 PyEnchant

对英语单词进行拼写检查,并可以对拼写错误的单词推荐一些可能的正确单词,安装:

pip3 install pyenchant

# If the install fails on a Linux server, the enchant component may be missing and must be installed first
sudo yum install enchant
# Install the commonly used English dictionary
sudo yum install aspell-en
sudo yum install enchant-aspell

在PyEnchant中主要用Dict对象,可以使用它来检查单词的拼写是否正确,同时还可以对拼写错误的单词提供几个可能的正确拼写

import enchant


def is_all_english_alphabet(word):
    """Return True when *word* is non-empty and made only of ASCII letters."""
    # Non-ASCII characters (e.g. accented letters) disqualify the word
    # even though str.isalpha() alone would accept them.
    if not word.isascii():
        return False
    return word.isalpha()

def check_word_existence(word):
    """Return True if *word* is found in the en_US Enchant dictionary.

    An empty string is reported as not existing: PyEnchant's
    ``Dict.check`` raises ``ValueError`` for it instead of returning False.
    """
    if not word:
        return False
    d = enchant.Dict('en_US')
    return d.check(word)

if __name__ == '__main__':
    # Interactive loop: keep checking words until the user types "exit".
    while True:
        wd = input("请输入单词:")
        lower_wd = wd.strip().lower()
        if lower_wd == 'exit':
            break
        if not lower_wd:
            # Blank input: there is nothing to check, and the Enchant
            # dictionary lookup rejects empty strings, so prompt again.
            continue
        is_en = is_all_english_alphabet(lower_wd)
        print("全是英文字符吗:", is_en)
        exist = check_word_existence(lower_wd)
        print("真实存在该单词吗:", exist)

创建Dict对象可以使用如下方式;其中filename为本地词表文件,文件中每一行只存放一个单词

方法描述
d = enchant.Dict(language)使用指定语言创建Dict对象
d = enchant.request_dict(language)使用指定语言创建Dict对象
d = enchant.request_pwl_dict(filename)只用本地文件中的词汇创建Dict对象
d = enchant.DictWithPWL(language, filename)将内置某语言以及本地文件中的词汇合并来创建Dict对象

Dict对象有如下方法与属性

方法or属性描述
d = enchant.Dict(language)指定语言创建一个Dict对象
d.tag当前Dict使用的语言
d.check(word)检查word的拼写是否正确
d.suggest(word)对拼写错误的word提供几个正确拼写的单词

enchant模块提供了如下查询语言支持情况的方法

方法描述
enchant.dict_exists(language)查看当前enchant模块是否支持某种语言
enchant.list_languages()查看当前enchant模块支持的所有语言

用enchant.checker中的SpellChecker类来对一整段文本中的单词进行拼写检查

from enchant.checker import SpellChecker


def check_paragraph(para):
    """Print each word of *para* that the en_US spell checker reports."""
    checker = SpellChecker("en_US")
    checker.set_text(para)
    # Iterating the checker yields one error object per flagged word.
    for mistake in checker:
        print("ERROR", mistake.word)

if __name__ == '__main__':
    # Demo run on a sentence containing misspelled words.
    check_paragraph("This is sme sample txt with erors.")

ERROR sme
ERROR txt
ERROR erors

使用分词器(tokenizer)进行分词,返回结果的格式为(word, pos),其中pos是word在整个文本中的起始位置

from enchant.tokenize import get_tokenizer


def tokenize_text(text):
    """Print the list of tokens the en_US tokenizer produces for *text*."""
    tokenizer = get_tokenizer("en_US")
    tokens = list(tokenizer(text))
    print(tokens)

if __name__ == '__main__':
    # Demo run on a short sample sentence.
    tokenize_text("This is some sample text written by Soloman.")

[('This', 0), ('is', 5), ('some', 8), ('sample', 13), ('text', 20), ('written', 25), ('by', 33), ('Soloman', 36)]

PyEnchant GitHub

2 自建单词库

使用自己的单词库文件创建单词集合,重点在于找到一份可靠的 english_words.txt

# Load the word list once; the file is expected to hold one word per line.
# The encoding is explicit so the result does not depend on the platform
# default, and blank lines are skipped so "" never counts as a word.
with open("english_words.txt", encoding="utf-8") as word_file:
    english_words = {
        word.strip().lower() for word in word_file if word.strip()
    }

def is_english_word(word):
    """Return True if *word* is in the ``english_words`` set.

    The lookup key is normalised with ``strip().lower()``, the same way
    the entries of ``english_words`` are normalised when the set is built,
    so surrounding whitespace and letter case do not affect the result.
    """
    return word.strip().lower() in english_words

if __name__ == '__main__':
    # Demo lookup of a single word.
    print(is_english_word("hacker"))

3 NLTK + Wordnet

from nltk.corpus import wordnet

# ``word_to_test`` is a placeholder for the word being checked.
# An empty synset list means WordNet does not know the string.
# NOTE: WordNet only covers nouns, verbs, adjectives and adverbs, so
# function words (e.g. "the") may also come back empty.
if not wordnet.synsets(word_to_test):
  # Not an English word
  pass
else:
  # English word
  pass

NLTK 官方文档

4 有道词典API

# -*- coding: utf-8 -*-
import sys
import uuid
import requests
import hashlib
import time
# ``imp`` was deprecated in Python 3.4 and removed in 3.12;
# importlib.reload is the supported replacement and keeps the name
# ``reload`` in scope.
from importlib import reload

# Carried over from the original sample code (a Python 2 era idiom).
reload(sys)

# Endpoint that do_request() posts to.
YOUDAO_URL = 'https://openapi.youdao.com/api'
# Placeholder values: replace with your own application ID and secret.
# Both are concatenated into the string that connect() signs.
APP_KEY = '您的应用ID'
APP_SECRET = '您的应用密钥'


def encrypt(signStr):
    """Return the hex-encoded SHA-256 digest of *signStr* (UTF-8 encoded)."""
    payload = signStr.encode('utf-8')
    return hashlib.sha256(payload).hexdigest()


def truncate(q):
    """Shorten *q* to the form used when building the request signature.

    Returns None for None. Input of at most 20 characters is returned
    unchanged; longer input becomes its first 10 characters, its total
    length as decimal text, and its last 10 characters.
    """
    if q is None:
        return None
    length = len(q)
    if length <= 20:
        return q
    return q[:10] + str(length) + q[-10:]


def do_request(data, timeout=10):
    """POST *data* as a URL-encoded form to ``YOUDAO_URL``.

    Args:
        data: Mapping of form fields to send.
        timeout: Seconds to wait for the server before giving up.
            Requests applies no timeout by default, so without one a
            stalled connection would block the caller indefinitely.

    Returns:
        The ``requests`` response object.
    """
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    return requests.post(
        YOUDAO_URL, data=data, headers=headers, timeout=timeout
    )


def connect():
    """Send one sample request to the API and handle the response.

    Builds the form payload, signs it with the SHA-256 digest of
    app key + truncated query + salt + timestamp + app secret, and posts
    it. An ``audio/mp3`` response is written to a file; any other
    response body is printed.
    """
    q = "待输入的文字"

    curtime = str(int(time.time()))
    salt = str(uuid.uuid1())
    sign = encrypt(APP_KEY + truncate(q) + salt + curtime + APP_SECRET)
    data = {
        'from': '源语言',
        'to': '目标语言',
        'signType': 'v3',
        'curtime': curtime,
        'appKey': APP_KEY,
        'q': q,
        'salt': salt,
        'sign': sign,
        'vocabId': "您的用户词表ID",
    }

    response = do_request(data)
    # .get() so a response without a Content-Type header falls through to
    # the print branch instead of raising KeyError.
    content_type = response.headers.get('Content-Type', '')
    if content_type == "audio/mp3":
        millis = int(round(time.time() * 1000))
        file_path = "合成的音频存储路径" + str(millis) + ".mp3"
        # The context manager closes the file even if the write fails.
        with open(file_path, 'wb') as fo:
            fo.write(response.content)
    else:
        print(response.content)


# Run the sample request only when this file is executed as a script.
if __name__ == '__main__':
    connect()

有道词典 API 官方文档