Check English Word with Python
1 PyEnchant
对英语单词进行拼写检查,并可以对拼写错误的单词推荐一些可能的正确单词,安装:
pip3 install pyenchant
# If the installation fails on a Linux server, the enchant component may be missing and must be installed first
sudo yum install enchant
# Install the commonly used English dictionaries
sudo yum install aspell-en
sudo yum install enchant-aspell
在PyEnchant中主要用Dict对象,可以使用它来检查单词的拼写是否正确,同时还可以对拼写错误的单词提供几个可能的正确拼写
import enchant
def is_all_english_alphabet(word):
    """Report whether ``word`` is made up solely of ASCII letters.

    Returns ``True`` only when every character is ASCII and the string is
    alphabetic. The empty string yields ``False`` because ``str.isalpha``
    is ``False`` for it.
    """
    return word.isascii() and word.isalpha()
def check_word_existence(word):
    """Return whether ``word`` is accepted by the ``en_US`` Enchant dictionary.

    Args:
        word: The word to look up.

    Returns:
        The result of ``Dict.check`` for a non-empty word; ``False`` for an
        empty word.
    """
    # Enchant raises ValueError for an empty string instead of answering
    # False, so short-circuit before touching the dictionary.
    if not word:
        return False
    return enchant.Dict('en_US').check(word)
if __name__ == '__main__':
    # Interactive loop: keep prompting until the user types "exit"
    # (surrounding whitespace and letter case are ignored).
    while True:
        candidate = input("请输入单词:").strip().lower()
        if candidate == 'exit':
            break
        # The character check is reported before the dictionary lookup runs.
        print("全是英文字符吗:", is_all_english_alphabet(candidate))
        print("真实存在该单词吗:", check_word_existence(candidate))
创建Dict对象可以使用如下方式,其中filename为本地词表文件,文件中每一行只存放一个单词
| 方法 | 描述 |
|---|---|
| d = enchant.Dict(language) | 使用指定语言创建Dict对象 |
| d = enchant.request_dict(language) | 使用指定语言创建Dict对象 |
| d = enchant.request_pwl_dict(filename) | 只用本地文件中的词汇创建Dict对象 |
| d = enchant.DictWithPWL(language, filename) | 将内置某语言以及本地文件中的词汇合并来创建Dict对象 |
Dict对象有如下方法与属性
| 方法or属性 | 描述 |
|---|---|
| d = enchant.Dict(language) | 指定语言创建一个Dict对象 |
| d.tag | 当前Dict使用的语言 |
| d.check(word) | 检查word的拼写是否正确 |
| d.suggest(word) | 对拼写错误的word提供几个正确拼写的单词 |
enchant模块提供了如下与语言相关的方法
| 方法 | 描述 |
|---|---|
| enchant.dict_exists(language) | 查看当前enchant模块是否支持某种语言 |
| enchant.list_languages() | 查看当前enchant模块支持的所有语言 |
用enchant.checker中的SpellChecker类来对一整段文本中的单词进行拼写检查
from enchant.checker import SpellChecker
def check_paragraph(para):
    """Print every word in ``para`` that the ``en_US`` spell checker flags.

    Each flagged word is written to stdout on its own line, prefixed with
    ``ERROR``.
    """
    checker = SpellChecker("en_US")
    checker.set_text(para)
    for mistake in checker:
        print("ERROR", mistake.word)
if __name__ == '__main__':
    # Demo run on a sample sentence that contains misspelled words.
    check_paragraph("This is sme sample txt with erors.")
ERROR sme
ERROR txt
ERROR erors
使用分词器(Tokenizer)进行分词,返回结果格式为(word, pos),其中pos是word在整个文本中的起始位置
from enchant.tokenize import get_tokenizer
def tokenize_text(text):
    """Tokenize ``text`` with the ``en_US`` tokenizer and print the tokens.

    The items produced by the tokenizer are collected into a list, which is
    then printed in a single call.
    """
    tokenizer = get_tokenizer("en_US")
    tokens = list(tokenizer(text))
    print(tokens)
if __name__ == '__main__':
    # Demo run on a sample sentence.
    tokenize_text("This is some sample text written by Soloman.")
[('This', 0), ('is', 5), ('some', 8), ('sample', 13), ('text', 20), ('written', 25), ('by', 33), ('Soloman', 36)]
2 自建单词库
使用自己的单词库文件创建单词集合,重点在于找到一份可靠的 english_words.txt
# Load the word list once at import time. Each line of the file becomes one
# entry, stripped of surrounding whitespace and lowercased.
# The encoding is explicit so the result does not depend on the platform's
# locale default.
with open("english_words.txt", encoding="utf-8") as word_file:
    # Blank lines are skipped so the empty string never counts as a word.
    english_words = {
        stripped.lower()
        for stripped in map(str.strip, word_file)
        if stripped
    }
def is_english_word(word):
    """Return ``True`` if ``word``, lowercased, is in ``english_words``."""
    normalized = word.lower()
    return normalized in english_words
if __name__ == '__main__':
    # Quick demo lookup of a single word.
    print(is_english_word("hacker"))
3 NLTK + Wordnet
from nltk.corpus import wordnet
# `word_to_test` must be defined before this check runs.
if not wordnet.synsets(word_to_test):
    # Not an English word: the synsets lookup came back empty.
    # `pass` keeps the branch syntactically valid; put handling here.
    pass
else:
    # English word: the synsets lookup returned at least one entry.
    pass
4 有道词典API
# -*- coding: utf-8 -*-
import sys
import uuid
import requests
import hashlib
import time
# The `imp` module was removed in Python 3.12; `importlib.reload` is the
# function that `imp.reload` delegated to, so the name `reload` is unchanged.
from importlib import reload
# NOTE(review): reloading `sys` looks like a Python 2 leftover (it used to
# precede sys.setdefaultencoding) — confirm it can be dropped on Python 3.
reload(sys)
# Youdao OpenAPI endpoint.
YOUDAO_URL = "https://openapi.youdao.com/api"
# Placeholder credentials: replace with your own application ID and secret.
APP_KEY = "您的应用ID"
APP_SECRET = "您的应用密钥"
def encrypt(signStr):
    """Return the SHA-256 hex digest of ``signStr`` encoded as UTF-8."""
    return hashlib.sha256(signStr.encode('utf-8')).hexdigest()
def truncate(q):
    """Shorten ``q`` to a bounded-length form.

    ``None`` is passed through. Inputs of up to 20 characters are returned
    unchanged. Longer inputs become the first 10 characters, then the total
    length as decimal text, then the last 10 characters.
    """
    if q is None:
        return None
    length = len(q)
    if length <= 20:
        return q
    head, tail = q[:10], q[-10:]
    return head + str(length) + tail
def do_request(data, timeout=10):
    """POST ``data`` as a URL-encoded form to ``YOUDAO_URL``.

    Args:
        data: Form fields to send in the request body.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on an unresponsive
            host.

    Returns:
        The response object returned by ``requests.post``.

    Raises:
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.
    """
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    return requests.post(YOUDAO_URL, data=data, headers=headers, timeout=timeout)
def connect():
    """Send one signed request to the Youdao API and handle the response.

    Builds the form fields (the text, languages and vocabulary ID are
    placeholders to be filled in), signs them as
    ``encrypt(APP_KEY + truncate(q) + salt + curtime + APP_SECRET)`` and posts
    them via ``do_request``. An ``audio/mp3`` response is saved to a
    timestamped ``.mp3`` file; any other response body is printed.
    """
    q = "待输入的文字"
    curtime = str(int(time.time()))
    salt = str(uuid.uuid1())
    sign = encrypt(APP_KEY + truncate(q) + salt + curtime + APP_SECRET)
    data = {
        'from': '源语言',
        'to': '目标语言',
        'signType': 'v3',
        'curtime': curtime,
        'appKey': APP_KEY,
        'q': q,
        'salt': salt,
        'sign': sign,
        'vocabId': "您的用户词表ID",
    }

    response = do_request(data)
    # .get() keeps a response without a Content-Type header on the
    # print-the-body path instead of raising KeyError.
    content_type = response.headers.get('Content-Type', '')
    if content_type == "audio/mp3":
        millis = int(round(time.time() * 1000))
        file_path = "合成的音频存储路径" + str(millis) + ".mp3"
        # The context manager closes the file even if the write fails.
        with open(file_path, 'wb') as audio_file:
            audio_file.write(response.content)
    else:
        print(response.content)
if __name__ == "__main__":
    # Script entry point: issue a single request.
    connect()