|
| 1 | +--- |
| 2 | +applies_to: |
| 3 | + stack: all |
| 4 | +navigation_title: Term vectors API examples |
| 5 | +--- |
| 6 | +# Term vectors API examples |
| 7 | + |
| 8 | +[Term vectors](/reference/elasticsearch/mapping-reference/term-vector.md) provide information about the terms that were produced by the analysis process, including term frequencies, positions, offsets, and payloads. They're useful for applications like highlighting, more-like-this queries, and text analysis. |
| 9 | + |
| 10 | +This page shows you examples of using the [term vectors API](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-termvectors). |
| 11 | + |
| 12 | +## Returning stored term vectors [docs-termvectors-api-stored-termvectors] |
| 13 | + |
| 14 | +First, create an index that stores term vectors with positions, offsets, and payloads: |
| 15 | + |
| 16 | +```console |
| 17 | +PUT /my-index-000001 |
| 18 | +{ "mappings": { |
| 19 | + "properties": { |
| 20 | + "text": { |
| 21 | + "type": "text", |
| 22 | + "term_vector": "with_positions_offsets_payloads", |
| 23 | + "store" : true, |
| 24 | + "analyzer" : "fulltext_analyzer" |
| 25 | + }, |
| 26 | + "fullname": { |
| 27 | + "type": "text", |
| 28 | + "term_vector": "with_positions_offsets_payloads", |
| 29 | + "analyzer" : "fulltext_analyzer" |
| 30 | + } |
| 31 | + } |
| 32 | + }, |
| 33 | + "settings" : { |
| 34 | + "index" : { |
| 35 | + "number_of_shards" : 1, |
| 36 | + "number_of_replicas" : 0 |
| 37 | + }, |
| 38 | + "analysis": { |
| 39 | + "analyzer": { |
| 40 | + "fulltext_analyzer": { |
| 41 | + "type": "custom", |
| 42 | + "tokenizer": "whitespace", |
| 43 | + "filter": [ |
| 44 | + "lowercase", |
| 45 | + "type_as_payload" |
| 46 | + ] |
| 47 | + } |
| 48 | + } |
| 49 | + } |
| 50 | + } |
| 51 | +} |
| 52 | +``` |
| 53 | + |
| 54 | +Add some documents: |
| 55 | + |
| 56 | +```console |
| 57 | +PUT /my-index-000001/_doc/1 |
| 58 | +{ |
| 59 | + "fullname" : "John Doe", |
| 60 | + "text" : "test test test " |
| 61 | +} |
| 62 | + |
| 63 | +PUT /my-index-000001/_doc/2?refresh=wait_for |
| 64 | +{ |
| 65 | + "fullname" : "Jane Doe", |
| 66 | + "text" : "Another test ..." |
| 67 | +} |
| 68 | +``` |
| 69 | + |
| 70 | +% TEST[continued] |
| 71 | + |
| 72 | +The following request returns all information and statistics for the |
| 73 | +`text` field in document `1` (John Doe): |
| 74 | + |
| 75 | +```console |
| 76 | +GET /my-index-000001/_termvectors/1 |
| 77 | +{ |
| 78 | + "fields" : ["text"], |
| 79 | + "offsets" : true, |
| 80 | + "payloads" : true, |
| 81 | + "positions" : true, |
| 82 | + "term_statistics" : true, |
| 83 | + "field_statistics" : true |
| 84 | +} |
| 85 | +``` |
| 86 | + |
| 87 | +% TEST[continued] |
| 88 | + |
| 89 | +Response: |
| 90 | + |
| 91 | +```console-result |
| 92 | +{ |
| 93 | + "_index": "my-index-000001", |
| 94 | + "_id": "1", |
| 95 | + "_version": 1, |
| 96 | + "found": true, |
| 97 | + "took": 6, |
| 98 | + "term_vectors": { |
| 99 | + "text": { |
| 100 | + "field_statistics": { |
| 101 | + "sum_doc_freq": 4, |
| 102 | + "doc_count": 2, |
| 103 | + "sum_ttf": 6 |
| 104 | + }, |
| 105 | + "terms": { |
| 106 | + "test": { |
| 107 | + "doc_freq": 2, |
| 108 | + "ttf": 4, |
| 109 | + "term_freq": 3, |
| 110 | + "tokens": [ |
| 111 | + { |
| 112 | + "position": 0, |
| 113 | + "start_offset": 0, |
| 114 | + "end_offset": 4, |
| 115 | + "payload": "d29yZA==" |
| 116 | + }, |
| 117 | + { |
| 118 | + "position": 1, |
| 119 | + "start_offset": 5, |
| 120 | + "end_offset": 9, |
| 121 | + "payload": "d29yZA==" |
| 122 | + }, |
| 123 | + { |
| 124 | + "position": 2, |
| 125 | + "start_offset": 10, |
| 126 | + "end_offset": 14, |
| 127 | + "payload": "d29yZA==" |
| 128 | + } |
| 129 | + ] |
| 130 | + } |
| 131 | + } |
| 132 | + } |
| 133 | + } |
| 134 | +} |
| 135 | +``` |
| 136 | + |
| 137 | +% TEST[continued] |
| 138 | +% TESTRESPONSE[s/"took": 6/"took": "$body.took"/] |
| 139 | + |
| 140 | +## Generating term vectors on the fly [docs-termvectors-api-generate-termvectors] |
| 141 | + |
| 142 | +Term vectors that are not explicitly stored in the index are automatically |
| 143 | +computed on the fly. The following request returns all information and statistics for the |
| 144 | +fields in document `1`, even though the terms haven't been explicitly stored in the index. |
| 145 | +Note that for the field `text`, the terms are not regenerated, because its term vectors are already stored. |
| 146 | + |
| 147 | +```console |
| 148 | +GET /my-index-000001/_termvectors/1 |
| 149 | +{ |
| 150 | + "fields" : ["text", "some_field_without_term_vectors"], |
| 151 | + "offsets" : true, |
| 152 | + "positions" : true, |
| 153 | + "term_statistics" : true, |
| 154 | + "field_statistics" : true |
| 155 | +} |
| 156 | +``` |
| 157 | + |
| 158 | +% TEST[continued] |
| 159 | + |
| 160 | +## Artificial documents [docs-termvectors-artificial-doc] |
| 161 | + |
| 162 | +Term vectors can also be generated for artificial documents, |
| 163 | +that is, for documents not present in the index. For example, the following request would |
| 164 | +return the same results as the first example on this page. The mapping used is determined by the `index`. |
| 165 | + |
| 166 | +*If dynamic mapping is turned on (default), the document fields not in the original mapping will be dynamically created.* |
| 167 | + |
| 168 | +```console |
| 169 | +GET /my-index-000001/_termvectors |
| 170 | +{ |
| 171 | + "doc" : { |
| 172 | + "fullname" : "John Doe", |
| 173 | + "text" : "test test test" |
| 174 | + } |
| 175 | +} |
| 176 | +``` |
| 177 | + |
| 178 | +% TEST[continued] |
| 179 | + |
| 180 | +## Per-field analyzer [docs-termvectors-per-field-analyzer] |
| 181 | + |
| 182 | +Additionally, an analyzer different from the one configured for the field can be provided |
| 183 | +by using the `per_field_analyzer` parameter. This is useful for |
| 184 | +generating term vectors in any fashion, especially when using artificial |
| 185 | +documents. When providing an analyzer for a field that already stores term |
| 186 | +vectors, the term vectors will be regenerated. |
| 187 | + |
| 188 | +```console |
| 189 | +GET /my-index-000001/_termvectors |
| 190 | +{ |
| 191 | + "doc" : { |
| 192 | + "fullname" : "John Doe", |
| 193 | + "text" : "test test test" |
| 194 | + }, |
| 195 | + "fields": ["fullname"], |
| 196 | + "per_field_analyzer" : { |
| 197 | + "fullname": "keyword" |
| 198 | + } |
| 199 | +} |
| 200 | +``` |
| 201 | + |
| 202 | +% TEST[continued] |
| 203 | + |
| 204 | +Response: |
| 205 | + |
| 206 | +```console-result |
| 207 | +{ |
| 208 | + "_index": "my-index-000001", |
| 209 | + "_version": 0, |
| 210 | + "found": true, |
| 211 | + "took": 6, |
| 212 | + "term_vectors": { |
| 213 | + "fullname": { |
| 214 | + "field_statistics": { |
| 215 | + "sum_doc_freq": 2, |
| 216 | + "doc_count": 4, |
| 217 | + "sum_ttf": 4 |
| 218 | + }, |
| 219 | + "terms": { |
| 220 | + "John Doe": { |
| 221 | + "term_freq": 1, |
| 222 | + "tokens": [ |
| 223 | + { |
| 224 | + "position": 0, |
| 225 | + "start_offset": 0, |
| 226 | + "end_offset": 8 |
| 227 | + } |
| 228 | + ] |
| 229 | + } |
| 230 | + } |
| 231 | + } |
| 232 | + } |
| 233 | +} |
| 234 | +``` |
| 235 | + |
| 236 | +% TEST[continued] |
| 237 | +% TESTRESPONSE[s/"took": 6/"took": "$body.took"/] |
| 238 | +% TESTRESPONSE[s/"sum_doc_freq": 2/"sum_doc_freq": "$body.term_vectors.fullname.field_statistics.sum_doc_freq"/] |
| 239 | +% TESTRESPONSE[s/"doc_count": 4/"doc_count": "$body.term_vectors.fullname.field_statistics.doc_count"/] |
| 240 | +% TESTRESPONSE[s/"sum_ttf": 4/"sum_ttf": "$body.term_vectors.fullname.field_statistics.sum_ttf"/] |
| 241 | + |
| 242 | +## Terms filtering [docs-termvectors-terms-filtering] |
| 243 | + |
| 244 | +Finally, the terms returned can be filtered based on their tf-idf scores. In |
| 245 | +the example below, we obtain the three most "interesting" keywords from the |
| 246 | +artificial document having the given "plot" field value. Notice |
| 247 | +that neither the keyword "Tony" nor any stop words are part of the response, as |
| 248 | +their tf-idf scores must be too low. |
| 249 | + |
| 250 | +```console |
| 251 | +GET /imdb/_termvectors |
| 252 | +{ |
| 253 | + "doc": { |
| 254 | + "plot": "When wealthy industrialist Tony Stark is forced to build an armored suit after a life-threatening incident, he ultimately decides to use its technology to fight against evil." |
| 255 | + }, |
| 256 | + "term_statistics": true, |
| 257 | + "field_statistics": true, |
| 258 | + "positions": false, |
| 259 | + "offsets": false, |
| 260 | + "filter": { |
| 261 | + "max_num_terms": 3, |
| 262 | + "min_term_freq": 1, |
| 263 | + "min_doc_freq": 1 |
| 264 | + } |
| 265 | +} |
| 266 | +``` |
| 267 | + |
| 268 | +% TEST[skip:no imdb test index] |
| 269 | + |
| 270 | +Response: |
| 271 | + |
| 272 | +```console-result |
| 273 | +{ |
| 274 | + "_index": "imdb", |
| 275 | + "_version": 0, |
| 276 | + "found": true, |
| 277 | + "term_vectors": { |
| 278 | + "plot": { |
| 279 | + "field_statistics": { |
| 280 | + "sum_doc_freq": 3384269, |
| 281 | + "doc_count": 176214, |
| 282 | + "sum_ttf": 3753460 |
| 283 | + }, |
| 284 | + "terms": { |
| 285 | + "armored": { |
| 286 | + "doc_freq": 27, |
| 287 | + "ttf": 27, |
| 288 | + "term_freq": 1, |
| 289 | + "score": 9.74725 |
| 290 | + }, |
| 291 | + "industrialist": { |
| 292 | + "doc_freq": 88, |
| 293 | + "ttf": 88, |
| 294 | + "term_freq": 1, |
| 295 | + "score": 8.590818 |
| 296 | + }, |
| 297 | + "stark": { |
| 298 | + "doc_freq": 44, |
| 299 | + "ttf": 47, |
| 300 | + "term_freq": 1, |
| 301 | + "score": 9.272792 |
| 302 | + } |
| 303 | + } |
| 304 | + } |
| 305 | + } |
| 306 | +} |
| 307 | +``` |
0 commit comments