 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
+# Copyright (C) 2011 Radim Rehurek <radimrehurek@seznam.cz>
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 
-"""Automatically detect common phrases -- multi-word expressions / word n-grams -- from a stream of sentences.
+"""
+Automatically detect common phrases -- aka multi-word expressions, word n-gram collocations -- from
+a stream of sentences.
 
 Inspired by:
 

 >>> from gensim.models.word2vec import Text8Corpus
 >>> from gensim.models.phrases import Phrases, Phraser
 >>>
+>>> # Load training data.
 >>> sentences = Text8Corpus(datapath('testcorpus.txt'))
->>> phrases = Phrases(sentences, min_count=1, threshold=1) # train model
->>> phrases[[u'trees', u'graph', u'minors']] # apply model to sentence
-[u'trees_graph', u'minors']
+>>> # The training corpus must be a sequence (stream, generator) of sentences,
+>>> # with each sentence a list of tokens:
+>>> print(list(sentences)[0][:10])
+['computer', 'human', 'interface', 'computer', 'response', 'survey', 'system', 'time', 'user', 'interface']
+>>>
+>>> # Train a toy bigram model.
+>>> phrases = Phrases(sentences, min_count=1, threshold=1)
+>>> # Apply the trained phrases model to a new, unseen sentence.
+>>> phrases[['trees', 'graph', 'minors']]
+['trees_graph', 'minors']
+>>> # The toy model considered "trees graph" a single phrase, so it joined the
+>>> # two tokens into a single token, `trees_graph`.
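+>>> # Detected phrases and their scores can also be inspected directly. A minimal
+>>> # sketch (not part of the original example), assuming this module's
+>>> # `export_phrases` helper:
+>>> for phrase, score in phrases.export_phrases(sentences):
+...     pass  # `phrase` is a detected bigram, `score` its collocation score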
 >>>
->>> phrases.add_vocab([["hello", "world"], ["meow"]]) # update model with new sentences
+>>> # Update the model with two new sentences on the fly.
+>>> phrases.add_vocab([["hello", "world"], ["meow"]])
 >>>
->>> bigram = Phraser(phrases) # construct faster model (this is only an wrapper)
->>> bigram[[u'trees', u'graph', u'minors']] # apply model to sentence
-[u'trees_graph', u'minors']
+>>> # Export the trained model for less RAM and faster processing; the exported
+>>> # model can no longer be updated.
+>>> bigram = Phraser(phrases)
+>>> bigram[['trees', 'graph', 'minors']]  # apply the exported model to a sentence
+['trees_graph', 'minors']
 >>>
->>> for sent in bigram[sentences]: # apply model to text corpus
+>>> # Apply the exported model to each sentence of a corpus:
+>>> for sent in bigram[sentences]:
 ...     pass
+>>>
+>>> # Save / load an exported collocation model.
+>>> bigram.save("/tmp/my_bigram_model.pkl")
+>>> bigram_reloaded = Phraser.load("/tmp/my_bigram_model.pkl")
+>>> bigram_reloaded[['trees', 'graph', 'minors']]  # apply the exported model to a sentence
+['trees_graph', 'minors']
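+>>>
+>>> # Longer n-grams can be detected by stacking: a sketch (not part of the
+>>> # original example) that trains a second Phrases model on the already-bigrammed
+>>> # corpus, so frequent bigram-plus-unigram sequences merge into trigrams:
+>>> trigram = Phrases(bigram[sentences], min_count=1, threshold=1)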
 
 """
 