Dive into secure and efficient coding practices with our curated list of the top 10 examples showcasing 'gensim' in Python. Our advanced machine learning engine meticulously scans each line of code, cross-referencing millions of open source libraries to ensure your implementation is not just functional, but also robust and secure. Elevate your natural language processing projects by mastering word embeddings, topic modeling, and document similarity with confidence and precision.
def test_doc2vec_inference_saveload():
    # Train a small Doc2Vec model, persist it, and make sure a reloaded copy still works for retrieval.
    tagged_docs = [TaggedDocument(simple_preprocess(doc), [i])
                   for i, doc in enumerate(documents)]
    model = Doc2Vec(tagged_docs, epochs=1, min_count=1, vector_size=10)
    model.save(TEST_FILE)
    del model
    model = Doc2Vec.load(TEST_FILE)
    os.remove(TEST_FILE)
    # Wrap the reloaded model in a retrieval pipeline and query it.
    d2v = Doc2VecInference(model, DEFAULT_ANALYZER)
    match_op = Matching()
    retrieval = Retrieval(d2v, matching=match_op).fit(documents)
    result = retrieval.query("scientists")
    assert result[0] == 1
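For comparison, here is a minimal, self-contained sketch of the same save/load round trip using only gensim (the corpus, file path, and query below are illustrative), confirming that inference still works on a reloaded model:

import os
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess

docs = ["scientists study the climate", "cats sleep all day"]  # illustrative corpus
tagged = [TaggedDocument(simple_preprocess(d), [i]) for i, d in enumerate(docs)]
model = Doc2Vec(tagged, epochs=1, min_count=1, vector_size=10)
model.save("/tmp/d2v.model")
reloaded = Doc2Vec.load("/tmp/d2v.model")
os.remove("/tmp/d2v.model")
vector = reloaded.infer_vector(simple_preprocess("scientists"))  # inference after reload
print(vector.shape)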
def similarity_3_contexts(p, t):
    # Cosine similarity between the BEF/BET/AFT context vectors of two relationship instances.
    (bef, bet, aft) = (0, 0, 0)
    if t.bef_vector is not None and p.bef_vector is not None:
        bef = dot(matutils.unitvec(t.bef_vector), matutils.unitvec(p.bef_vector))
    if t.bet_vector is not None and p.bet_vector is not None:
        bet = dot(matutils.unitvec(t.bet_vector), matutils.unitvec(p.bet_vector))
    if t.aft_vector is not None and p.aft_vector is not None:
        aft = dot(matutils.unitvec(t.aft_vector), matutils.unitvec(p.aft_vector))
    # With the current 0/1/0 weights only the "between" context contributes to the score.
    return 0*bef + 1*bet + 0*aft
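For reference, a small self-contained check (with illustrative numpy vectors) that dot(unitvec(x), unitvec(y)) is simply cosine similarity, which is what the weighted sum above combines across the three contexts:

import numpy as np
from numpy import dot
from gensim import matutils

x = np.array([1.0, 2.0, 3.0], dtype=np.float32)
y = np.array([2.0, 0.5, 1.0], dtype=np.float32)
cosine = dot(matutils.unitvec(x), matutils.unitvec(y))
assert abs(cosine - dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))) < 1e-5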
def make_index():
    logging.info('loading dictionary')
    dictionary = gensim.corpora.Dictionary.load_from_text('svd/dictionary.txt')
    logging.info('loading corpus')
    corpus = gensim.corpora.MmCorpus('svd/corpus.mm')
    tfidf = gensim.models.TfidfModel(corpus)
    logging.info('loading model')
    model = gensim.models.ldamodel.LdaModel.load('svd/lda.txt')
    logging.info('building lda docs')
    # Chain the transformations lazily: bag-of-words -> TF-IDF -> LDA topic space.
    lda_corpus = model[tfidf[corpus]]
    logging.info('building index')
    # num_features (1000 here) should match the number of LDA topics.
    index = gensim.similarities.docsim.Similarity('/tmp/lda_index.txt', lda_corpus, 1000)
    index.save('svd/lda_index.txt')
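A minimal sketch, assuming the same file layout as make_index above, of reloading the saved artifacts and running a similarity query against the index (the query text is illustrative):

import gensim

dictionary = gensim.corpora.Dictionary.load_from_text('svd/dictionary.txt')
tfidf = gensim.models.TfidfModel(gensim.corpora.MmCorpus('svd/corpus.mm'))
lda = gensim.models.ldamodel.LdaModel.load('svd/lda.txt')
index = gensim.similarities.docsim.Similarity.load('svd/lda_index.txt')

# Project the query through the same bow -> TF-IDF -> LDA pipeline used to build the index.
query_bow = dictionary.doc2bow("machine learning".lower().split())
query_lda = lda[tfidf[query_bow]]
sims = index[query_lda]  # cosine similarity against every indexed document
print(sorted(enumerate(sims), key=lambda item: -item[1])[:10])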
def train_word2vec_model(input: str, output_directory: str, model_name: str) -> None:
    if not os.access(output_directory, os.W_OK):
        print("Cannot write to directory {}. Exiting!".format(output_directory))
        exit(1)
    if os.path.isdir(input):
        # A directory of text files: stream sentences from every file in it.
        sentences = gensim.models.word2vec.PathLineSentences(input)
    else:
        # A single text file with one sentence per line.
        sentences = gensim.models.word2vec.LineSentence(input)
    # CBOW model (sg=0); note that gensim 4.x renamed `size` to `vector_size`.
    model = gensim.models.Word2Vec(sentences, sg=0, size=100, window=10, min_count=20, workers=10)
    model.train(sentences, total_examples=model.corpus_count, epochs=10)
    model.save(output_directory + model_name)
    # We want the vectors only to reduce memory footprint: this is the file(s) that the online lexicon should use.
    vectors = model.wv
    vectors.save(output_directory + model_name + ".vectors-only")
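A minimal sketch, with an illustrative path in place of output_directory + model_name, of reloading the vectors-only file; mmap='r' memory-maps the arrays so several processes can share them read-only:

from gensim.models import KeyedVectors

vectors = KeyedVectors.load("models/my_model.vectors-only", mmap='r')  # hypothetical path
print(vectors.most_similar("science", topn=5))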
else:
    params["scoreperclass"] = True
if "word_norm" not in params.keys():
    params["word_norm"] = 1
if "oov_random" not in params.keys():
    params["oov_random"] = 0
if "emb_model" in params.keys():
    emb_models = []
    print("===> use pre-trained embeddings...")
    model_str = params["emb_model"].split(',')
    for m_s in model_str:
        # Files saved with gensim's native .save() carry ".gensim" in their name;
        # everything else is treated as the original word2vec binary format.
        gensimFormat = ".gensim" in m_s
        if gensimFormat:
            emb_models.append(gensim.models.KeyedVectors.load(m_s, mmap='r'))
        else:
            emb_models.append(gensim.models.KeyedVectors.
                              load_word2vec_format(m_s, binary=True))
    print("<=== loaded {} models".format(len(emb_models)))
if "emb_dim" in params.keys():
    emb_dim = int(params["emb_dim"])
if "gpu" in params.keys():
    if params["gpu"] == "1":
        print("using gpu...")
    else:
        print("using cpu...")
if "wdist" in params.keys():
    wdist_file = params["wdist"]
else:
    wdist_file = None
use_mixed_data = False
def load_embeddings_gensim(embeddings_config, label, vocabulary, save_to):
    # create a weight matrix for entities in training docs
    embedding_matrix = np.zeros((len(vocabulary), embeddings_config['dims']))
    # load embeddings binary model with gensim for word2vec and rdf2vec embeddings
    model = gensim.models.Word2Vec.load(embeddings_config['path'])
    # model = gensim.models.KeyedVectors.load_word2vec_format(embeddings_config['path'], binary=True)
    embedded_entities = model.wv
    missing = 0
    for entity, entity_id in vocabulary.items():
        # strip entity label format to rdf2vec label format
        # rdf2vec_entity_label = 'dbr:%s' % entity.split('/')[-1]
        # print(rdf2vec_entity_label)
        rdf2vec_entity_label = '<' + entity + '>'
        if rdf2vec_entity_label in embedded_entities:
            embedding_matrix[entity_id] = embedded_entities[rdf2vec_entity_label]
        else:
            missing += 1
    print("done loading gensim entities. %d missing" % missing)
    # save embedding_matrix for entities in the training dataset
    np.save(save_to, embedding_matrix)
    # print(embedding_matrix)
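A minimal sketch of reading the saved matrix back and looking up one entity's row; the file name, entity URI, and vocabulary mapping below are illustrative stand-ins for the arguments used above:

import numpy as np

vocabulary = {"http://dbpedia.org/resource/Berlin": 0}  # hypothetical entity URI -> row index mapping
embedding_matrix = np.load("entity_embeddings.npy")     # hypothetical save_to path
print(embedding_matrix[vocabulary["http://dbpedia.org/resource/Berlin"]].shape)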
print "third vocab"
#st conc pt conc pd conc br conc mr vocab w/o pars
t3 = list()
for i in range(len(st)):
p = st1[i].split()+pt1[i].split()+pd1[i].split()+br1[i].split()+mr1[i].split()+ab1[i].split()+at1[i].split()
t3.append(p)
print "fourth vocab"
#trin models
model0 = gensim.models.Word2Vec(t, sg=1, window=10, sample=1e-5, negative=5, size=300)
model1 = gensim.models.Word2Vec(t1, sg=1, window=10, sample=1e-5, negative=5, size=300)
model2 = gensim.models.Word2Vec(t2, sg=1, window=10, sample=1e-5, negative=5, size=300)
model3 = gensim.models.Word2Vec(t3, sg=1, window=10, sample=1e-5, negative=5, size=300)
#model4 = gensim.models.Word2Vec(t, sg=0, hs=1, window=10, size=300)
#model5 = gensim.models.Word2Vec(t1, sg=0, hs=1,window=10, size=300)
#model6 = gensim.models.Word2Vec(t2, sg=0, hs=1, window=10, size=300)
#model7 = gensim.models.Word2Vec(t3, sg=0, hs=1,window=10, size=300)
print "model prepared"
#for each model calculate features^ n_similarity between st and something else
model_list=[model0,model1,model2,model3] #,model4 ,model5,model6,model7]
n_sim=list()
for model in model_list:
n_sim_pt=list()
for i in range(len(st)):
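The loop is truncated here. For context, a small self-contained sketch of the kind of n_similarity feature it would compute: the cosine similarity between the averaged vectors of two token lists (toy sentences only; gensim 4.x uses vector_size where the snippet above uses size):

from gensim.models import Word2Vec

sentences = [["python", "is", "a", "language"], ["science", "is", "fun"], ["python", "is", "fun"]]
toy = Word2Vec(sentences, vector_size=10, min_count=1, sg=1)
# n_similarity averages each token set's vectors and returns the cosine similarity of the two means.
print(toy.wv.n_similarity(["python", "is"], ["science", "fun"]))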
CNP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, k, r)), dtype=theano.config.floatX)  # @UndefinedVariable
self.C = theano.shared(value=CNP, name='C')
# self.C = theano.printing.Print("C = ")(self.C)
# Selectional Preferences
Ca1NP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, r)), dtype=theano.config.floatX)
Ca2NP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, r)), dtype=theano.config.floatX)
self.C1 = theano.shared(value=Ca1NP, name='C1')
self.C2 = theano.shared(value=Ca2NP, name='C2')
# argument embeddings
ANP = np.asarray(rng.uniform(-0.01, 0.01, size=(a, k)), dtype=theano.config.floatX)  # @UndefinedVariable
if ex_emb:
    import gensim
    # Initialise argument embeddings from a pre-trained word2vec model where possible.
    external_embeddings = gensim.models.Word2Vec.load(settings.external_embeddings_path)
    for idArg in range(self.a):
        arg = data.id2Arg[idArg].lower().split(' ')
        new = np.zeros(k, dtype=theano.config.floatX)
        size = 0
        for ar in arg:
            if ar in external_embeddings:
                new += external_embeddings[ar]
                size += 1
        if size > 0:
            ANP[idArg] = new / size
self.A = theano.shared(value=ANP, name='A')  # (a1, k)
self.Ab = theano.shared(value=np.zeros(a, dtype=theano.config.floatX),  # @UndefinedVariable
                        name='Ab', borrow=True)
argument = sys.argv[1]
filename = argument.split('/')[-1]
args = filename.split('.')[0].split('__')
(urlhash, algo, vectorsize, windowsize) = args
if algo == "skipgram":
    skipgram = 1
else:
    skipgram = 0
data = gensim.models.word2vec.LineSentence(argument)
model = gensim.models.Word2Vec(data, size=int(vectorsize), min_count=2, window=int(windowsize), sg=skipgram, workers=2, iter=5, cbow_mean=1)
# Discard the raw training state and keep only the normalised vectors before exporting (older gensim API).
model.init_sims(replace=True)
model.save_word2vec_format(root + '/trained/' + filename.split('.')[0].split('__')[0] + '.model', binary=True)
os.remove(root + '/tmp/' + filename.split('.')[0].split('__')[0])
import os
from collections import OrderedDict

import pandas as pd
from nlpia.data.loaders import get_data, BIGDATA_PATH
from gensim.models import KeyedVectors

word_vectors = get_data('word2vec')  # not in book
wordvector_path = os.path.join(BIGDATA_PATH, 'GoogleNews-vectors-negative300.bin.gz')  # not in book, reader required to compose this path
if 'word_vectors' not in globals():  # not in book
    WV = word_vectors = get_data('word2vec')
    word_vectors = KeyedVectors.load_word2vec_format(wordvector_path, binary=True)

###################################################
# Still need to create a class derived from gensim's Word2Vec model instead of relying on word_vectors globals

COMPONENT_WORDS = OrderedDict([
    ('placeness', ('geography Geography geographic geographical geographical_location location ' +
                   'locale locations proximity').split()),
    ('peopleness', 'human Humans homo_sapiens peole people individuals humankind people men women'.split()),
    ('animalness', 'animal mammal carnivore animals Animal animal_welfare dog pet cats ani_mal'.split()),
    ('conceptness', 'concept concepts idea'.split()),
    ('femaleness', 'female Female females femal woman girl lady'.split()),
])

def component_vector(words):
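    # The snippet is truncated here. A plausible, hypothetical completion (not the original code):
    # average the vectors of the seed words that exist in the loaded word_vectors model.
    found = [word_vectors[word] for word in words if word in word_vectors]
    return sum(found) / len(found)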