---
layout: model
title: BERT Sequence Classification - Identify Trec Data Classes
author: John Snow Labs
name: bert_sequence_classifier_trec_coarse
date: 2021-11-06
tags: [bert_for_sequence_classification, trec, en, open_source]
task: Text Classification
language: en
edition: Spark NLP 3.3.2
spark_version: 2.4
supported: true
article_header:
  type: cover
use_language_switcher: "Python-Scala-Java"
---

## Description

This model was imported from Hugging Face ([aychang/bert-base-cased-trec-coarse](https://huggingface.co/aychang/bert-base-cased-trec-coarse)). It is a BERT base (cased) model fine-tuned on the TREC question classification dataset using its six coarse labels.

## Predicted Entities

`DESC`, `ENTY`, `HUM`, `NUM`, `ABBR`, `LOC`

{:.btn-box}
<button class="button button-orange" disabled>Live Demo</button>
<button class="button button-orange" disabled>Open in Colab</button>
[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_sequence_classifier_trec_coarse_en_3.3.2_2.4_1636229841055.zip){:.button.button-orange.button-orange-trans.arr.button-icon}

## How to use

<div class="tabs-box" markdown="1">
{% include programmingLanguageSelectScalaPythonNLU.html %}
```python
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer, BertForSequenceClassification
from pyspark.ml import Pipeline

document_assembler = DocumentAssembler() \
    .setInputCol('text') \
    .setOutputCol('document')

tokenizer = Tokenizer() \
    .setInputCols(['document']) \
    .setOutputCol('token')

sequenceClassifier = BertForSequenceClassification \
    .pretrained('bert_sequence_classifier_trec_coarse', 'en') \
    .setInputCols(['token', 'document']) \
    .setOutputCol('class') \
    .setCaseSensitive(True) \
    .setMaxSentenceLength(512)

pipeline = Pipeline(stages=[document_assembler, tokenizer, sequenceClassifier])

example = spark.createDataFrame([['Germany is the largest country in Europe economically.']]).toDF("text")
result = pipeline.fit(example).transform(example)
```
```scala
import com.johnsnowlabs.nlp.base._
import com.johnsnowlabs.nlp.annotator._
import org.apache.spark.ml.Pipeline
import spark.implicits._

val document_assembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

val tokenizer = new Tokenizer()
  .setInputCols("document")
  .setOutputCol("token")

val sequenceClassifier = BertForSequenceClassification.pretrained("bert_sequence_classifier_trec_coarse", "en")
  .setInputCols("document", "token")
  .setOutputCol("class")
  .setCaseSensitive(true)
  .setMaxSentenceLength(512)

val pipeline = new Pipeline().setStages(Array(document_assembler, tokenizer, sequenceClassifier))

val example = Seq("Germany is the largest country in Europe economically.").toDF("text")

val result = pipeline.fit(example).transform(example)
```
</div>

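Once the pipeline has been run, the predicted label can be read back from the annotation column configured with `setOutputCol('class')`. A minimal sketch, using the Python `result` DataFrame from the example above:

```python
# Show the input text next to the predicted TREC coarse label.
# 'class.result' extracts the label strings from the annotation structs.
result.select("text", "class.result").show(truncate=False)
```
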
## Results

```bash
['LOC']
```

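For quick checks on single sentences, the fitted pipeline can also be wrapped in Spark NLP's `LightPipeline`, which annotates plain strings without building a DataFrame. A minimal sketch, assuming the Python pipeline and `example` DataFrame defined above:

```python
from sparknlp.base import LightPipeline

# Wrap the fitted PipelineModel for fast in-memory annotation.
light = LightPipeline(pipeline.fit(example))

# annotate() returns a dict keyed by output column name;
# the predicted TREC coarse label is stored under 'class'.
print(light.annotate("Germany is the largest country in Europe economically.")["class"])
```
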
{:.model-param}
## Model Information

{:.table-model}
|---|---|
|Model Name:|bert_sequence_classifier_trec_coarse|
|Compatibility:|Spark NLP 3.3.2+|
|License:|Open Source|
|Edition:|Official|
|Input Labels:|[token, sentence]|
|Output Labels:|[label]|
|Language:|en|
|Case sensitive:|true|

## Data Source

[https://huggingface.co/aychang/bert-base-cased-trec-coarse](https://huggingface.co/aychang/bert-base-cased-trec-coarse)