Dive into secure and efficient coding practices with our curated list of the top 10 examples showcasing 'nltk' in Python. Our advanced machine learning engine meticulously scans each line of code, cross-referencing millions of open source libraries to ensure your implementation is not just functional, but also robust and secure. Master everyday NLTK tasks such as tokenization, stopword removal, lemmatization, and text clustering with confidence and precision.
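Most of the snippets below assume that the relevant NLTK corpora are already downloaded and that helpers such as stopwords, word_tokenize, and WordNetLemmatizer are in scope. A minimal setup sketch (the exact resource list is an assumption based on what the examples use, not part of any single snippet):

import nltk

nltk.download('stopwords')   # stopword lists used for filtering
nltk.download('punkt')       # Punkt sentence tokenizer models
nltk.download('wordnet')     # WordNet, used for POS constants and lemmatization

from nltk.corpus import stopwords, wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize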
from nltk.corpus import wordnet as wn

def convert_to_wordnet_pos(senseval_pos):
    """Map a coarse Senseval/universal POS tag to the corresponding WordNet constant."""
    if senseval_pos == 'VERB':
        return wn.VERB
    elif senseval_pos == 'NOUN':
        return wn.NOUN
    elif senseval_pos == 'ADV':
        return wn.ADV
    elif senseval_pos == 'ADJ':
        return wn.ADJ
    else:
        return None
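A hypothetical usage sketch: the mapped tag can be passed straight to WordNetLemmatizer, which expects WordNet POS constants (the example words are illustrative only, and the imports come from the setup sketch above):

lemmatizer = WordNetLemmatizer()
pos = convert_to_wordnet_pos('VERB')                  # wn.VERB, i.e. 'v'
print(lemmatizer.lemmatize('running', pos=pos))       # -> 'run'
print(lemmatizer.lemmatize('geese', pos=convert_to_wordnet_pos('NOUN')))  # -> 'goose'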
def test_subtrees_for_phrase(self):
    # self._sentence is created in the test fixture; Tree is nltk.tree.Tree.
    t = self._sentence.subtrees_for_phrase("NP")[0]
    self.assertIsInstance(t, Tree)
    self.assertEqual("property", t[-1].leaves()[0])
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# `dictionary` (a spell-check dictionary such as pyenchant's), `punctuations`,
# and `hoax_stopwords` are module-level objects defined elsewhere in the project.

def preprocess(text):
    # Normalise encoding artifacts and strip quote/apostrophe noise.
    text = (text.encode('utf-8').decode("ascii", "replace")
                .replace(u"\ufffd", "_").replace("___", "'").replace("'s", " ")
                .replace("``", " ").replace("''", " ").replace("_", " ")
                .replace("'", " ").replace("`", " "))
    text = re.sub("[^0-9a-zA-Z !\"/:;<=>?.,!@#$%^&\\-_|()']+", " ", text)
    tokens = text.split(" ")
    result = ""
    for token in tokens:
        word = token.split(" ")[0]
        if word not in stopwords.words('english') and token not in punctuations and token not in hoax_stopwords:
            if len(word) > 0:
                if word.isupper() and dictionary.check(word.lower()):
                    # Known upper-case word: lemmatise as noun, fall back to verb.
                    new_token = lemmatizer.lemmatize(token.lower())
                    if new_token == token.lower():
                        new_token = lemmatizer.lemmatize(token.lower(), pos='v')
                    result += new_token + " "
                elif word.isupper():
                    # Unknown all-caps token (e.g. an acronym or name): title-case it.
                    result += token.title() + " "
                elif dictionary.check(word.lower()):
                    new_token = lemmatizer.lemmatize(token.lower())
                    if new_token == token.lower():
                        new_token = lemmatizer.lemmatize(token.lower(), pos='v')
                    result += new_token + " "
                else:
                    result += token + " "
        else:
            # Stopwords, punctuation, and project-specific stopwords are dropped.
            pass
    return result
# Requires gensim: from gensim import corpora
def create_dic(self, documents):
    # Lower-case, split, and drop English stopwords from every document.
    texts = [[word for word in document.lower().split() if word not in stopwords.words('english')]
             for document in documents]
    # Count token frequencies across the whole collection.
    from collections import defaultdict
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    # Keep only tokens that appear more than once.
    texts = [[token for token in text if frequency[token] > 1]
             for text in texts]
    # Build the gensim dictionary and the bag-of-words corpus.
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    return [dictionary, corpus]
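A usage sketch, assuming `helper` is an instance of the class that defines create_dic and that gensim is installed (the sample documents are illustrative only):

docs = [
    "machine learning with python",
    "deep learning and machine learning",
    "python for natural language processing",
]
dictionary, corpus = helper.create_dic(docs)
print(dictionary.token2id)   # token -> integer id
print(corpus[0])             # bag-of-words: list of (token id, count) pairs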
def create_word_features(self, words):
    # Flatten the input lines into individual words.
    w = []
    for line in words:
        for wrd in line.split():
            w.append(wrd)
    # Drop English stopwords and re-join the remainder into a single string.
    useful_words = [word for word in w if word not in stopwords.words('english')]
    my_dict = ' '.join(useful_words)
    return my_dict
import collections

from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

def cluster_texts(texts, clusters=3):
    """Transform texts to TF-IDF coordinates and cluster them using K-Means."""
    # `process_text` is a project-specific tokenizer passed to the vectorizer.
    vectorizer = TfidfVectorizer(tokenizer=process_text,
                                 stop_words=stopwords.words('english'),
                                 max_df=1.0,
                                 min_df=1,
                                 lowercase=True)
    tfidf_model = vectorizer.fit_transform(texts)
    km_model = KMeans(n_clusters=clusters, n_init=100, verbose=0, tol=1e-10)
    km_model.fit(tfidf_model)
    # Group document indices by their assigned cluster label.
    clustering = collections.defaultdict(list)
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)
    return clustering
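A usage sketch for cluster_texts. The snippet relies on a project-specific process_text tokenizer that is not shown above, so a simple word_tokenize-based stand-in is assumed here; the sample documents are illustrative only:

from nltk.tokenize import word_tokenize

def process_text(text):
    # Hypothetical stand-in for the project's tokenizer: lowercase, alphabetic tokens only.
    return [t for t in word_tokenize(text.lower()) if t.isalpha()]

articles = [
    "The striker scored twice in the final match.",
    "The goalkeeper saved a late penalty in stoppage time.",
    "The central bank raised interest rates again this quarter.",
    "Inflation slowed after the latest rate hike.",
    "A new telescope captured images of a distant galaxy.",
    "Astronomers confirmed the discovery of another exoplanet.",
]
clusters = cluster_texts(articles, clusters=3)
for label, doc_ids in sorted(clusters.items()):
    print(label, doc_ids)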
import re

from nltk.corpus import stopwords

def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    # Convert words to lower case and split them.
    text = text.lower().split()
    # Optionally, remove stop words.
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if w not in stops]
    text = " ".join(text)
    # Clean the text: strip unwanted characters and expand common contractions.
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+\-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    # The stemming branch is not shown in this excerpt.
    return text
def isStopword(text):
    # Return 'Y' if any word in the text is an English stopword, otherwise 'N'.
    splitsent = text.split(' ')
    for w in splitsent:
        if w in stopwords.words('english'):
            return 'Y'
    return 'N'
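Quick check (assumes the stopwords corpus is available):

print(isStopword("the"))      # 'Y' -- "the" is an English stopword
print(isStopword("parser"))   # 'N'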
import nltk
from nltk.tokenize import word_tokenize

def preprocess(text):
    """
    Preprocess text for the encoder: sentence-split each document, then
    word-tokenize every sentence and re-join the tokens with spaces.
    """
    X = []
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    print("Loaded NLTK data")
    for t in text:
        sents = sent_detector.tokenize(t)
        result = ''
        for s in sents:
            tokens = word_tokenize(s)
            result += ' ' + ' '.join(tokens)
        X.append(result)
    return X
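A usage sketch: sentence-split and re-tokenize a couple of short documents (the sample strings are illustrative only):

docs = [
    "NLTK ships a pretrained Punkt model. It splits raw text into sentences.",
    "Tokenization comes next; each sentence becomes a list of word tokens.",
]
for encoded in preprocess(docs):
    print(encoded.strip())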
def _build_para_dict(self):
    # PPDB 2.0 "small" pack; each line looks like:
    # LHS ||| phrase ||| paraphrase ||| features ||| relation
    path = "data/ppdb-2.0-s-all"
    lines = read_lines(path)
    # Keep only pairs labelled with the Equivalence relation.
    equivalent_pairs = []
    print("Preprocessing raw data...")
    for line in tqdm(lines):
        split = line.split(" ||| ")
        if split[-1] == "Equivalence":
            equivalent_pairs.append(tuple(split[1:3]))
    # Tokenize each pair and drop pairs containing out-of-vocabulary tokens.
    equivalent_pairs_ubuntu = []
    print("Extracting paraphrase pairs...")
    for pair in tqdm(equivalent_pairs):
        tokens_0 = word_tokenize(pair[0])
        tokens_1 = word_tokenize(pair[1])
        if not (self._contains_unknown(tokens_0) or self._contains_unknown(tokens_1)):
            equivalent_pairs_ubuntu.append((tokens_0, tokens_1))
    # Insert paraphrases in both directions
    print("Building dictionary...")
    self.paraphrase_dict = {}
    for (p0, p1) in tqdm(equivalent_pairs_ubuntu):
        p0 = tuple(p0)
        p1 = tuple(p1)
        self.paraphrase_dict.setdefault(p0, []).append(p1)
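Once _build_para_dict has run, paraphrase_dict maps a tokenized phrase (as a tuple) to the list of its equivalent paraphrases. A hypothetical lookup, assuming `model` is an instance of the enclosing class, word_tokenize comes from NLTK, and the phrase shown is only illustrative:

key = tuple(word_tokenize("a number of"))
for paraphrase in model.paraphrase_dict.get(key, []):
    print(" ".join(paraphrase))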