JohnSnowLabs · maziyarpanahi · Jun 22, 2021 · Jun 18, 2021
diff --git a/python/sparknlp/training.py b/python/sparknlp/training.py
@@ -80,11 +80,22 @@ class PubTator(ExtendedJavaWrapper):
     def __init__(self):
         super(PubTator, self).__init__("com.johnsnowlabs.nlp.training.PubTator")
 
-    def readDataset(self, spark, path):
+    def readDataset(self, spark, path, ispadding=True):
+        """Read a PubTator-format file into a Spark DataFrame.
+
+        :param spark: active ``SparkSession``; its JVM session is handed to
+            the Scala reader.
+        :param path: location of the PubTator text file.
+        :param ispadding: forwarded to the Scala reader as ``isPaddedToken``.
+            ``True`` (default) emits zero-padded ``T###`` tags built from the
+            numeric part of the entity code; ``False`` keeps the entity
+            string unchanged.
+        :return: DataFrame wrapping the one produced on the JVM side.
+        """
 
         # ToDo Replace with std pyspark
         jSession = spark._jsparkSession
 
-        jdf = self._java_obj.readDataset(jSession, path)
+        jdf = self._java_obj.readDataset(jSession, path, ispadding)
         return DataFrame(jdf, spark._wrapped)
 
diff --git a/src/main/scala/com/johnsnowlabs/nlp/training/PubTator.scala b/src/main/scala/com/johnsnowlabs/nlp/training/PubTator.scala
@@ -8,9 +8,9 @@ import org.apache.spark.sql.types._
 import org.apache.spark.sql.{DataFrame, Row, SparkSession}
 
 
-object PubTator {
+case class PubTator() {
 
-  def readDataset(spark: SparkSession, path: String): DataFrame = {
+  def readDataset(spark: SparkSession, path: String,isPaddedToken: Boolean = true): DataFrame = {
     val pubtator = spark.sparkContext.textFile(path)
     val titles = pubtator.filter(x => x.contains("|a|") | x.contains("|t|"))
     val titlesText = titles.map(x => x.split("\\|")).groupBy(_.head)
@@ -37,16 +37,18 @@ object PubTator {
       val tokenAnnotations = tokens.map(Annotation(_))
       val labelAnnotations = chunkLabels.map(Annotation(_))
       tokenAnnotations.map(ta => {
-        val tokenLabel = labelAnnotations.filter(la => la.begin <= ta.begin && la.end >= ta.end).headOption
+        val tokenLabel = labelAnnotations.find(la => la.begin <= ta.begin && la.end >= ta.end)
         val tokenTag = {
           if (tokenLabel.isEmpty) "O"
           else {
-            val tokenCSV = tokenLabel.get.metadata.get("entity").get
+            val tokenCSV = tokenLabel.get.metadata("entity")
             if (tokenCSV == "UnknownType") "O"
             else {
               val tokenPrefix = if (ta.begin == tokenLabel.get.begin) "B-" else "I-"
-              val paddedTokenTag = "T" + "%03d".format(tokenCSV.split(",")(0).slice(1, 4).toInt)
-              tokenPrefix + paddedTokenTag
+              val token = if (isPaddedToken) {
+                "T" + "%03d".format(tokenCSV.split(",")(0).slice(1, 4).toInt)
+              } else tokenCSV
+              tokenPrefix + token
             }
           }
         }

diff --git a/src/test/resources/corpus_pubtator_not_padding.txt b/src/test/resources/corpus_pubtator_not_padding.txt
@@ -0,0 +1,19 @@
+22016685|t|A novel missense mutation Asp506Gly in Exon 13 of the F11 gene in an asymptomatic Korean woman with mild factor XI deficiency.
+22016685|a|Factor XI (FXI) deficiency is a rare autosomal recessive coagulation disorder most commonly found in Ashkenazi and Iraqi Jews, but it is also found in other ethnic groups. It is a trauma or surgery-related bleeding disorder, but spontaneous bleeding is rarely seen. The clinical manifestation of bleeding in FXI deficiency cases is variable and seems to poorly correlate with plasma FXI levels. The molecular pathology of FXI deficiency is mutation in the F11 gene on the chromosome band 4q35. We report a novel mutation of the F11 gene in an 18-year-old asymptomatic Korean woman with mild FXI deficiency. Pre-operative laboratory screen tests for lipoma on her back revealed slightly prolonged activated partial thromboplastin time (45.2 sec; reference range, 23.2-39.4 sec). Her FXI activity (35%) was slightly lower than the normal FXI activity (reference range, 50-150%). Direct sequence analysis of the F11 gene revealed a heterozygous A to G substitution in nucleotide 1517 (c.1517A>G) of exon 13, resulting in the substitution of aspartic acid with glycine in codon 506 (p.Asp506Gly). To the best of our knowledge, the Asp506Gly is a novel missense mutation, and this is the first genetically confirmed case of mild FXI deficiency in Korea.
+22016685	26	35	Asp506Gly	ProteinMutation	p|SUB|D|506|G
+22016685	1109	1118	c.1517A>G	DNAMutation	c|SUB|A|1517|G
+22016685	1206	1217	p.Asp506Gly	ProteinMutation	p|SUB|D|506|G
+22016685	1254	1263	Asp506Gly	ProteinMutation	p|SUB|D|506|G
+
+21850008|t|Mutations in mitochondrially encoded complex I enzyme as the second common cause in a cohort of Chinese patients with mitochondrial myopathy, encephalopathy, lactic acidosis and stroke-like episodes.
+21850008|a|The mutation pattern of mitochondrial DNA (mtDNA) in mainland Chinese patients with mitochondrial myopathy, encephalopathy, lactic acidosis and stroke-like episodes (MELAS) has been rarely reported, though previous data suggested that the mutation pattern of MELAS could be different among geographically localized populations. We presented the results of comprehensive mtDNA mutation analysis in 92 unrelated Chinese patients with MELAS (85 with classic MELAS and 7 with MELAS/Leigh syndrome (LS) overlap syndrome). The mtDNA A3243G mutation was the most common causal genotype in this patient group (79/92 and 85.9%). The second common gene mutation was G13513A (7/92 and 7.6%). Additionally, we identified T10191C (p.S45P) in ND3, A11470C (p. K237N) in ND4, T13046C (p.M237T) in ND5 and a large-scale deletion (13025-13033:14417-14425) involving partial ND5 and ND6 subunits of complex I in one patient each. Among them, A11470C, T13046C and the single deletion were novel mutations. In summary, patients with mutations affecting mitochondrially encoded complex I (MTND) reached 12.0% (11/92) in this group. It is noteworthy that all seven patients with MELAS/LS overlap syndrome were associated with MTND mutations. Our data emphasize the important role of MTND mutations in the pathogenicity of MELAS, especially MELAS/LS overlap syndrome.
+21850008	727	733	A3243G	DNAMutation	m|SUB|A|3243|G
+21850008	856	863	G13513A	DNAMutation	m|SUB|G|13513|A
+21850008	909	916	T10191C	DNAMutation	m|SUB|T|10191|C
+21850008	918	924	p.S45P	ProteinMutation	p|SUB|S|45|P
+21850008	934	941	A11470C	DNAMutation	m|SUB|A|11470|C
+21850008	943	951	p. K237N	ProteinMutation	p|SUB|K|237|N
+21850008	961	968	T13046C	DNAMutation	m|SUB|T|13046|C
+21850008	970	977	p.M237T	ProteinMutation	p|SUB|M|237|T
+21850008	1124	1131	A11470C	DNAMutation	m|SUB|A|11470|C
+21850008	1133	1140	T13046C	DNAMutation	m|SUB|T|13046|C
diff --git a/src/test/scala/com/johnsnowlabs/nlp/util/PubTatorTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/util/PubTatorTestSpec.scala
@@ -17,10 +17,21 @@ class PubTatorTestSpec extends FlatSpec{
     //remove file if it's already there
     val directory = new Directory(new File("./pubtator-conll-test"))
     directory.deleteRecursively()
-    val df = PubTator.readDataset(ResourceHelper.spark, "./src/test/resources/corpus_pubtator_sample.txt")
+    val df = PubTator().readDataset(ResourceHelper.spark, "./src/test/resources/corpus_pubtator_sample.txt")
     CoNLLGenerator.exportConllFiles(df, "pubtator-conll-test")
     directory.deleteRecursively()
 
   }
 
+  "PubTator.readDataset" should "create conll-friendly dataframe with not padding" taggedAs FastTest in {
+    // Scratch export directory: wiped up front in case an earlier run left it behind.
+    val exportDir = new Directory(new File("./pubtator_not_padding-conll-test"))
+    exportDir.deleteRecursively()
+    // Third argument `false` is isPaddedToken: tags keep the entity text, no T### padding.
+    val corpus = PubTator().readDataset(ResourceHelper.spark, "src/test/resources/corpus_pubtator_not_padding.txt", false)
+    CoNLLGenerator.exportConllFiles(corpus, "pubtator_not_padding-conll-test")
+    // Leave the working tree clean again.
+    exportDir.deleteRecursively()
+  }
+
 }