Top 10 Examples of "gensim" in Python

Dive into secure and efficient coding practices with our curated list of the top 10 examples showcasing 'gensim' in Python. Our advanced machine learning engine meticulously scans each line of code, cross-referencing millions of open source libraries to ensure your implementation is not just functional, but also robust and secure. The snippets below cover the library's most common tasks: training and persisting Word2Vec and Doc2Vec models, building TF-IDF and LDA pipelines, loading pre-trained embeddings, and computing document similarity.
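Before diving into the examples, here is a minimal, self-contained sketch of the workflow most of them build on: tokenize a handful of documents, train a Word2Vec model, and query it for similar words. The toy corpus and parameter values are illustrative only; note that several snippets further down target pre-4.0 gensim releases, where `vector_size` was still called `size`.

# Minimal sketch (gensim >= 4.0): train Word2Vec on a toy corpus and query it.
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

raw_docs = [
    "Scientists study the structure of proteins",
    "Machine learning models learn word embeddings from text",
    "Word embeddings capture semantic similarity between words",
]
sentences = [simple_preprocess(doc) for doc in raw_docs]

model = Word2Vec(sentences, vector_size=50, window=5, min_count=1, epochs=20)
print(model.wv.most_similar("embeddings", topn=3))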

import os

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess


def test_doc2vec_inference_saveload():
    # `documents`, `TEST_FILE`, `DEFAULT_ANALYZER`, `Doc2VecInference`, `Matching`
    # and `Retrieval` are fixtures of the surrounding retrieval test suite.
    tagged_docs = [TaggedDocument(simple_preprocess(doc), [i])
                   for i, doc in enumerate(documents)]
    model = Doc2Vec(tagged_docs, epochs=1, min_count=1, vector_size=10)
    model.save(TEST_FILE)
    del model
    model = Doc2Vec.load(TEST_FILE)
    os.remove(TEST_FILE)
    d2v = Doc2VecInference(model, DEFAULT_ANALYZER)
    match_op = Matching()
    retrieval = Retrieval(d2v, matching=match_op).fit(documents)
    result = retrieval.query("scientists")
    assert result[0] == 1
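Once the model has been saved and reloaded as in the test above, new text can be embedded without retraining. A minimal sketch, assuming the `model` reloaded via Doc2Vec.load() is still in scope (gensim >= 4.0; on older releases model.dv is called model.docvecs):

# Sketch: infer a vector for an unseen document with the reloaded Doc2Vec model.
from gensim.utils import simple_preprocess

new_doc = "scientists discover a new species"
vector = model.infer_vector(simple_preprocess(new_doc))

# Rank the training documents by similarity to the inferred vector.
print(model.dv.most_similar([vector], topn=3))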

from numpy import dot

from gensim import matutils


def similarity_3_contexts(p, t):
    # Cosine similarity between the before/between/after context vectors of
    # `p` and `t`; only the "between" context contributes to the final score.
    (bef, bet, aft) = (0, 0, 0)

    if t.bef_vector is not None and p.bef_vector is not None:
        bef = dot(matutils.unitvec(t.bef_vector), matutils.unitvec(p.bef_vector))

    if t.bet_vector is not None and p.bet_vector is not None:
        bet = dot(matutils.unitvec(t.bet_vector), matutils.unitvec(p.bet_vector))

    if t.aft_vector is not None and p.aft_vector is not None:
        aft = dot(matutils.unitvec(t.aft_vector), matutils.unitvec(p.aft_vector))

    return 0 * bef + 1 * bet + 0 * aft

import logging

import gensim


def make_index():
    logging.info('loading dictionary')
    dictionary = gensim.corpora.Dictionary.load_from_text('svd/dictionary.txt')
    logging.info('loading corpus')
    corpus = gensim.corpora.MmCorpus('svd/corpus.mm')
    tfidf = gensim.models.TfidfModel(corpus)
    logging.info('loading model')
    model = gensim.models.ldamodel.LdaModel.load('svd/lda.txt')
    logging.info('building lda docs')
    lda_corpus = model[tfidf[corpus]]
    logging.info('building index')
    index = gensim.similarities.docsim.Similarity('/tmp/lda_index.txt', lda_corpus, 1000)
    index.save('svd/lda_index.txt')
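To query the index that make_index() persists, the same dictionary, TfidfModel and LdaModel have to be applied to the query text before the lookup. A sketch, assuming the file paths used in the function above:

# Sketch: load the artifacts written by make_index() and run a query against the index.
import gensim

dictionary = gensim.corpora.Dictionary.load_from_text('svd/dictionary.txt')
corpus = gensim.corpora.MmCorpus('svd/corpus.mm')
tfidf = gensim.models.TfidfModel(corpus)
lda = gensim.models.ldamodel.LdaModel.load('svd/lda.txt')
index = gensim.similarities.docsim.Similarity.load('svd/lda_index.txt')

query_bow = dictionary.doc2bow("dimensionality reduction with svd".lower().split())
query_lda = lda[tfidf[query_bow]]
sims = index[query_lda]  # similarity of the query against every indexed document
print(sorted(enumerate(sims), key=lambda x: -x[1])[:5])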

import os

import gensim


def train_word2vec_model(input: str, output_directory: str, model_name: str) -> None:

    if not os.access(output_directory, os.W_OK):
        print("Cannot write to directory {}. Exiting!".format(output_directory))
        exit(1)

    # A directory is treated as a collection of corpus files, a single path as one file.
    if os.path.isdir(input):
        sentences = gensim.models.word2vec.PathLineSentences(input)
    else:
        sentences = gensim.models.word2vec.LineSentence(input)

    # Passing `sentences` to the constructor already trains the model once (pre-4.0
    # API, hence `size` instead of `vector_size`); the explicit train() call below
    # runs additional epochs over the same corpus.
    model = gensim.models.Word2Vec(sentences, sg=0, size=100, window=10, min_count=20, workers=10)
    model.train(sentences, total_examples=model.corpus_count, epochs=10)
    model.save(output_directory + model_name)
    # We want the vectors only to reduce the memory footprint: this is the file that
    # the online lexicon should use.
    vectors = model.wv
    vectors.save(output_directory + model_name + ".vectors-only")
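The ".vectors-only" file written at the end of train_word2vec_model() can later be memory-mapped without loading the full model, which is the point of saving the KeyedVectors separately. A sketch with a hypothetical output path:

# Sketch: reload only the word vectors saved by train_word2vec_model().
# The path below is hypothetical; it must match output_directory + model_name + ".vectors-only".
import gensim

vectors = gensim.models.KeyedVectors.load("models/news_w2v.vectors-only", mmap='r')
print(vectors.most_similar("science", topn=5))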

    # Fragment of a larger parameter-handling block; the matching "if" branch is not shown.
    else:
        params["scoreperclass"] = True
    if "word_norm" not in params.keys():
        params["word_norm"] = 1
    if "oov_random" not in params.keys():
        params["oov_random"] = 0
    if "emb_model" in params.keys():
        emb_models = []
        print("===> use pre-trained embeddings...")
        model_str = params["emb_model"].split(',')
        for m_s in model_str:
            gensimFormat = ".gensim" in m_s
            if gensimFormat:
                emb_models.append(gensim.models.KeyedVectors.load(m_s, mmap='r'))
            else:
                emb_models.append(gensim.models.KeyedVectors. \
                                  load_word2vec_format(m_s, binary=True))
        print("<===loaded {} models".format(len(emb_models)))
    if "emb_dim" in params.keys():
        emb_dim = int(params["emb_dim"])
    if "gpu" in params.keys():
        if params["gpu"] == "1":
            print("using gpu...")
        else:
            print("using cpu...")
    if "wdist" in params.keys():
        wdist_file = params["wdist"]
    else:
        wdist_file = None


    use_mixed_data = False
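The fragment above decides between gensim's native format and the original word2vec binary format purely by filename. The two loaders are not interchangeable; a small sketch of the distinction, with hypothetical file names:

# Sketch: the two loading paths used in the fragment above (file names are hypothetical).
import gensim

# Native gensim format (written by KeyedVectors.save); supports mmap for read-only sharing.
kv_native = gensim.models.KeyedVectors.load("embeddings/wiki.gensim", mmap='r')

# Original word2vec binary format (e.g. GoogleNews-vectors-negative300.bin); no mmap support.
kv_binary = gensim.models.KeyedVectors.load_word2vec_format(
    "embeddings/GoogleNews-vectors-negative300.bin", binary=True)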

import numpy as np

import gensim


def load_embeddings_gensim(embeddings_config, label, vocabulary, save_to):
    # create a weight matrix for entities in training docs
    embedding_matrix = np.zeros((len(vocabulary), embeddings_config['dims']))

    # load embeddings binary model with gensim for word2vec and rdf2vec embeddings
    model = gensim.models.Word2Vec.load(embeddings_config['path'])
    #model = gensim.models.KeyedVectors.load_word2vec_format(embeddings_config['path'], binary=True)
    embedded_entities = model.wv
    missing = 0
    for entity, entity_id in vocabulary.items():
        # strip entity label format to rdf2vec label format
        #rdf2vec_entity_label = 'dbr:%s' % entity.split('/')[-1]
        #print rdf2vec_entity_label
        rdf2vec_entity_label = '<' + entity + '>'
        if rdf2vec_entity_label in embedded_entities:
            embedding_matrix[entity_id] = embedded_entities[rdf2vec_entity_label]
        else:
            missing += 1
    print "done loading gensim entities. %d missing" % missing
    # save embedding_matrix for entities in the training dataset
    np.save(save_to, embedding_matrix)
    # print embedding_matrix
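A hypothetical call to load_embeddings_gensim(), to show the shape of the inputs it expects; the config keys come from the function body, but the paths, dimensions and vocabulary below are made up for illustration:

# Sketch: hypothetical inputs for load_embeddings_gensim().
embeddings_config = {'path': 'embeddings/rdf2vec.model', 'dims': 200}
vocabulary = {'http://dbpedia.org/resource/Berlin': 0,
              'http://dbpedia.org/resource/Paris': 1}

load_embeddings_gensim(embeddings_config, label='rdf2vec',
                       vocabulary=vocabulary, save_to='entity_matrix.npy')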
print "third vocab"   

#st conc pt conc pd conc br conc mr vocab w/o pars
t3 = list()
for i in range(len(st)):
    p = st1[i].split()+pt1[i].split()+pd1[i].split()+br1[i].split()+mr1[i].split()+ab1[i].split()+at1[i].split()
    t3.append(p)

print "fourth vocab" 

# train models
model0 = gensim.models.Word2Vec(t, sg=1, window=10, sample=1e-5, negative=5, size=300)
model1 = gensim.models.Word2Vec(t1, sg=1, window=10, sample=1e-5, negative=5, size=300)
model2 = gensim.models.Word2Vec(t2, sg=1, window=10, sample=1e-5, negative=5, size=300)
model3 = gensim.models.Word2Vec(t3, sg=1, window=10, sample=1e-5, negative=5, size=300)
#model4 = gensim.models.Word2Vec(t, sg=0,  hs=1, window=10,   size=300)
#model5 = gensim.models.Word2Vec(t1, sg=0, hs=1,window=10,   size=300)
#model6 = gensim.models.Word2Vec(t2, sg=0, hs=1, window=10,   size=300)
#model7 = gensim.models.Word2Vec(t3, sg=0, hs=1,window=10,   size=300)

print "model prepared"


# for each model, calculate n_similarity features between st and the other fields
model_list = [model0, model1, model2, model3]   # ,model4, model5, model6, model7]
n_sim = list()

for model in model_list:

    n_sim_pt = list()
    for i in range(len(st)):

        CNP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, k, r)),
                         dtype=theano.config.floatX)  # @UndefinedVariable
        self.C = theano.shared(value=CNP, name='C')
        # self.C = theano.printing.Print("C = ")(self.C)

        # Selectional Preferences
        Ca1NP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, r)), dtype=theano.config.floatX)
        Ca2NP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, r)), dtype=theano.config.floatX)
        self.C1 = theano.shared(value=Ca1NP, name='C1')
        self.C2 = theano.shared(value=Ca2NP, name='C2')
        # argument embeddings
        ANP = np.asarray(rng.uniform(-0.01, 0.01, size=(a, k)), dtype=theano.config.floatX)  # @UndefinedVariable

        if ex_emb:
            import gensim
            external_embeddings = gensim.models.Word2Vec.load(settings.external_embeddings_path)

            for idArg in xrange(self.a):
                arg = data.id2Arg[idArg].lower().split(' ')
                new = np.zeros(k, dtype=theano.config.floatX)
                size = 0
                for ar in arg:
                    if ar in external_embeddings:
                        new += external_embeddings[ar]
                        size += 1
                if size > 0:
                    ANP[idArg] = new/size

        self.A = theano.shared(value=ANP, name='A')  # (a1, k)

        self.Ab = theano.shared(value=np.zeros(a,  dtype=theano.config.floatX),  # @UndefinedVariable
                                 name='Ab', borrow=True)
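The membership test and item lookup on the Word2Vec object above (`ar in external_embeddings`, `external_embeddings[ar]`) rely on the pre-4.0 gensim API. On current releases the same lookups go through the model's KeyedVectors; a sketch with illustrative tokens:

# Sketch: the equivalent lookup on gensim >= 4.0 goes through model.wv.
external_embeddings = gensim.models.Word2Vec.load(settings.external_embeddings_path)
wv = external_embeddings.wv

for token in ['protein', 'binds', 'receptor']:  # illustrative tokens
    if token in wv:
        print(token, wv[token][:5])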

import os
import sys

import gensim

# The corpus filename is expected to encode the training settings as
# <urlhash>__<algo>__<vectorsize>__<windowsize>.
argument = sys.argv[1]
filename = argument.split('/')[-1]

args = filename.split('.')[0].split('__')
(urlhash, algo, vectorsize, windowsize) = args

if algo == "skipgram":
    skipgram = 1
else:
    skipgram = 0

data = gensim.models.word2vec.LineSentence(argument)


model = gensim.models.Word2Vec(data, size=int(vectorsize), min_count=2, window=int(windowsize), sg=skipgram, workers=2, iter=5, cbow_mean=1)
model.init_sims(replace=True)
model.save_word2vec_format(root+'/trained/'+filename.split('.')[0].split('__')[0]+'.model', binary=True)
os.remove(root+'/tmp/'+filename.split('.')[0].split('__')[0])
import os

from collections import OrderedDict

import pandas as pd
from nlpia.data.loaders import get_data, BIGDATA_PATH
from gensim.models import KeyedVectors


word_vectors = get_data('word2vec')  # not in book

wordvector_path = os.path.join(BIGDATA_PATH, 'GoogleNews-vectors-negative300.bin.gz')    # not in book, reader required to compose this path

if 'word_vectors' not in globals():  # not in book
    WV = word_vectors = get_data('word2vec')
    word_vectors = KeyedVectors.load_word2vec_format(wordvector_path, binary=True)


###################################################
# Still need to create a class derived from gensim's Word2vec model instead of relying on word_vectors globals

COMPONENT_WORDS = OrderedDict([
    ('placeness', ('geography Geography geographic geographical geographical_location location ' +
                   'locale locations proximity').split()),
    ('peopleness', 'human Humans homo_sapiens peole people individuals humankind people men women'.split()),
    ('animalness', 'animal mammal carnivore animals Animal animal_welfare dog pet cats ani_mal'.split()),
    ('conceptness', 'concept concepts idea'.split()),
    ('femaleness', 'female Female females femal woman girl lady'.split()),
])


def component_vector(words):
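
The listing is cut off before the body of component_vector(). A plausible completion is sketched below, assuming the function simply sums the vectors of the seed words that exist in the loaded KeyedVectors; this is an assumption, not the original implementation.

import numpy as np


def component_vector(words):
    # Hypothetical completion: sum the vectors of the seed words found in the vocabulary.
    vector = np.zeros(word_vectors.vector_size)
    for word in words:
        if word in word_vectors:  # skip seed words missing from the pre-trained vocabulary
            vector += word_vectors[word]
    return vector


placeness = component_vector(COMPONENT_WORDS['placeness'])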
