50行Python代码写一个语言检测器
(ITeye资讯频道)你有没有好奇过,Chrome浏览器是如何知道一个网页所用的语言,并为外文网页提供翻译服务的?或者,Facebook是如何翻译你朋友写在你主页上的外文内容的?检测一种语言实际上非常简单,它能改进用户体验,而且不需要用户做任何事情。我无意中发现了 ActiveState 上一个用 Python 编写语言检测器的 recipe(ActiveState recipe for a language detector in Python),这是一段非常不错的程序,但我决定对它做一点小小的改进。
class NGram(object):
    """Character n-gram frequency table built from a piece of text.

    This first version only counts n-grams; ``length`` is initialised to
    ``None`` and is not computed here.
    """

    def __init__(self, text, n=3):
        """Build the n-gram table for *text* using n-grams of size *n*."""
        self.length = None  # placeholder; this version never computes it
        self.n = n          # size of each n-gram (3 = trigram)
        self.table = {}     # maps n-gram string -> occurrence count
        self.parse_text(text)

    def parse_text(self, text):
        """Count every n-gram of *text* into ``self.table``.

        Runs of whitespace are collapsed to single spaces and one trailing
        space is appended, so the last word also ends on a word boundary.
        """
        # Start with a window of n spaces so the first letters are counted
        # as being at the start of a word.
        chars = ' ' * self.n
        for letter in (" ".join(text.split()) + " "):
            # Slide the window one character to the right.
            chars = chars[1:] + letter
            self.table[chars] = self.table.get(chars, 0) + 1
{
    '  S': 1,
    ' Sn': 1,
    'Sna': 1,
    'nai': 1,
    'ail': 2,
    'il ': 1,
    'l M': 1,
    ' Ma': 1,
    'Mai': 1,
    'il.': 1,
    'l. ': 1
}
class NGram(object):
    """Character n-gram frequency table that can be compared to another.

    The table is treated as a vector (one dimension per n-gram). Two
    ``NGram`` objects are compared with ``a - b``, which yields one minus
    the cosine of the angle between their vectors.
    """

    def __init__(self, text, n=3):
        """Build the n-gram table for *text* and cache its magnitude."""
        self.length = None  # vector magnitude, set by calculate_length()
        self.n = n          # size of each n-gram (3 = trigram)
        self.table = {}     # maps n-gram string -> occurrence count
        self.parse_text(text)
        self.calculate_length()

    def parse_text(self, text):
        """Count every n-gram of *text* into ``self.table``.

        Runs of whitespace are collapsed to single spaces and one trailing
        space is appended, so the last word also ends on a word boundary.
        """
        # Start with a window of n spaces so the first letters are counted
        # as being at the start of a word.
        chars = ' ' * self.n
        for letter in (" ".join(text.split()) + " "):
            # Slide the window one character to the right.
            chars = chars[1:] + letter
            self.table[chars] = self.table.get(chars, 0) + 1

    def calculate_length(self):
        """Treat the n-gram table as a vector and return its magnitude.

        The result is also stored in ``self.length`` for use by ``__sub__``.
        """
        self.length = sum(x * x for x in self.table.values()) ** 0.5
        return self.length

    def __sub__(self, other):
        """Return the cosine-based difference between two NGram objects.

        The value is ``1 - cos(angle)`` between the two table vectors:
        approximately 0 when the tables are proportional, and 1 when they
        share no n-gram at all.

        Raises:
            TypeError: if *other* is not an ``NGram`` or uses a different
                n-gram size.
        """
        if not isinstance(other, NGram):
            raise TypeError("Can't compare NGram with non-NGram object.")
        if self.n != other.n:
            raise TypeError("Can't compare NGram objects of different size.")
        # Dot product; n-grams missing from the other table contribute 0.
        total = 0
        for k in self.table:
            total += self.table[k] * other.table.get(k, 0)
        return 1.0 - float(total) / (float(self.length) * float(other.length))

    def find_match(self, languages):
        """Return the NGram in *languages* that is closest to this one.

        *languages* is an iterable of ``NGram`` objects, each representing
        one language. The comparison function must be passed as ``key=``;
        passing it positionally makes ``min`` treat it as a second item to
        compare rather than as the ranking function.
        """
        return min(languages, key=lambda n: self - n)

    # Alias kept so callers using the name ``best_match`` also work.
    best_match = find_match
# Build a trigram profile for a language from a training corpus.
# NOTE(review): `training_text` and `text` are not defined in this snippet;
# they are assumed to be strings supplied by the caller.
english = NGram(training_text, n=3)  # trigram

# Difference between the English profile and an unknown text
# (the smaller the value, the closer the match).
similarity = english - NGram(text, n=3)

# Pick the closest language profile for the unknown text.
# NOTE(review): `spanish` and `french` are presumably built the same way as
# `english` from their own training text; they are not defined here.
languages = [english, spanish, french]
NGram(text, n=3).find_match(languages)