From 4186c11c5a6fe7b47816ffd6d327c122444586de Mon Sep 17 00:00:00 2001 From: xusliebana Date: Fri, 18 Jun 2021 19:32:29 +0200 Subject: [PATCH] python pubtator fix --- python/sparknlp/training.py | 4 ++-- .../johnsnowlabs/nlp/training/PubTator.scala | 14 ++++++++------ .../resources/corpus_pubtator_not_padding.txt | 19 +++++++++++++++++++ .../nlp/util/PubTatorTestSpec.scala | 13 ++++++++++++- 4 files changed, 41 insertions(+), 9 deletions(-) create mode 100644 src/test/resources/corpus_pubtator_not_padding.txt diff --git a/python/sparknlp/training.py b/python/sparknlp/training.py index 5f3b35d9c8eb15..00bf59d240423a 100644 --- a/python/sparknlp/training.py +++ b/python/sparknlp/training.py @@ -80,11 +80,11 @@ class PubTator(ExtendedJavaWrapper): def __init__(self): super(PubTator, self).__init__("com.johnsnowlabs.nlp.training.PubTator") - def readDataset(self, spark, path): + def readDataset(self, spark, path,ispadding=True): # ToDo Replace with std pyspark jSession = spark._jsparkSession - jdf = self._java_obj.readDataset(jSession, path) + jdf = self._java_obj.readDataset(jSession, path,ispadding) return DataFrame(jdf, spark._wrapped) diff --git a/src/main/scala/com/johnsnowlabs/nlp/training/PubTator.scala b/src/main/scala/com/johnsnowlabs/nlp/training/PubTator.scala index b94c00c52b170b..ad402981fb3d30 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/training/PubTator.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/training/PubTator.scala @@ -8,9 +8,9 @@ import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Row, SparkSession} -object PubTator { +case class PubTator() { - def readDataset(spark: SparkSession, path: String): DataFrame = { + def readDataset(spark: SparkSession, path: String,isPaddedToken: Boolean = true): DataFrame = { val pubtator = spark.sparkContext.textFile(path) val titles = pubtator.filter(x => x.contains("|a|") | x.contains("|t|")) val titlesText = titles.map(x => x.split("\\|")).groupBy(_.head) @@ -37,16 +37,18 @@ object PubTator { val tokenAnnotations = tokens.map(Annotation(_)) val labelAnnotations = chunkLabels.map(Annotation(_)) tokenAnnotations.map(ta => { - val tokenLabel = labelAnnotations.filter(la => la.begin <= ta.begin && la.end >= ta.end).headOption + val tokenLabel = labelAnnotations.find(la => la.begin <= ta.begin && la.end >= ta.end) val tokenTag = { if (tokenLabel.isEmpty) "O" else { - val tokenCSV = tokenLabel.get.metadata.get("entity").get + val tokenCSV = tokenLabel.get.metadata("entity") if (tokenCSV == "UnknownType") "O" else { val tokenPrefix = if (ta.begin == tokenLabel.get.begin) "B-" else "I-" - val paddedTokenTag = "T" + "%03d".format(tokenCSV.split(",")(0).slice(1, 4).toInt) - tokenPrefix + paddedTokenTag + val token = if (isPaddedToken) { + "T" + "%03d".format(tokenCSV.split(",")(0).slice(1, 4).toInt) + } else tokenCSV + tokenPrefix + token } } } diff --git a/src/test/resources/corpus_pubtator_not_padding.txt b/src/test/resources/corpus_pubtator_not_padding.txt new file mode 100644 index 00000000000000..a1a0f635bb8aa4 --- /dev/null +++ b/src/test/resources/corpus_pubtator_not_padding.txt @@ -0,0 +1,19 @@ +22016685|t|A novel missense mutation Asp506Gly in Exon 13 of the F11 gene in an asymptomatic Korean woman with mild factor XI deficiency. +22016685|a|Factor XI (FXI) deficiency is a rare autosomal recessive coagulation disorder most commonly found in Ashkenazi and Iraqi Jews, but it is also found in other ethnic groups. It is a trauma or surgery-related bleeding disorder, but spontaneous bleeding is rarely seen. The clinical manifestation of bleeding in FXI deficiency cases is variable and seems to poorly correlate with plasma FXI levels. The molecular pathology of FXI deficiency is mutation in the F11 gene on the chromosome band 4q35. We report a novel mutation of the F11 gene in an 18-year-old asymptomatic Korean woman with mild FXI deficiency. Pre-operative laboratory screen tests for lipoma on her back revealed slightly prolonged activated partial thromboplastin time (45.2 sec; reference range, 23.2-39.4 sec). Her FXI activity (35%) was slightly lower than the normal FXI activity (reference range, 50-150%). Direct sequence analysis of the F11 gene revealed a heterozygous A to G substitution in nucleotide 1517 (c.1517A>G) of exon 13, resulting in the substitution of aspartic acid with glycine in codon 506 (p.Asp506Gly). To the best of our knowledge, the Asp506Gly is a novel missense mutation, and this is the first genetically confirmed case of mild FXI deficiency in Korea. +22016685 26 35 Asp506Gly ProteinMutation p|SUB|D|506|G +22016685 1109 1118 c.1517A>G DNAMutation c|SUB|A|1517|G +22016685 1206 1217 p.Asp506Gly ProteinMutation p|SUB|D|506|G +22016685 1254 1263 Asp506Gly ProteinMutation p|SUB|D|506|G + +21850008|t|Mutations in mitochondrially encoded complex I enzyme as the second common cause in a cohort of Chinese patients with mitochondrial myopathy, encephalopathy, lactic acidosis and stroke-like episodes. +21850008|a|The mutation pattern of mitochondrial DNA (mtDNA) in mainland Chinese patients with mitochondrial myopathy, encephalopathy, lactic acidosis and stroke-like episodes (MELAS) has been rarely reported, though previous data suggested that the mutation pattern of MELAS could be different among geographically localized populations. We presented the results of comprehensive mtDNA mutation analysis in 92 unrelated Chinese patients with MELAS (85 with classic MELAS and 7 with MELAS/Leigh syndrome (LS) overlap syndrome). The mtDNA A3243G mutation was the most common causal genotype in this patient group (79/92 and 85.9%). The second common gene mutation was G13513A (7/92 and 7.6%). Additionally, we identified T10191C (p.S45P) in ND3, A11470C (p. K237N) in ND4, T13046C (p.M237T) in ND5 and a large-scale deletion (13025-13033:14417-14425) involving partial ND5 and ND6 subunits of complex I in one patient each. Among them, A11470C, T13046C and the single deletion were novel mutations. In summary, patients with mutations affecting mitochondrially encoded complex I (MTND) reached 12.0% (11/92) in this group. It is noteworthy that all seven patients with MELAS/LS overlap syndrome were associated with MTND mutations. Our data emphasize the important role of MTND mutations in the pathogenicity of MELAS, especially MELAS/LS overlap syndrome. +21850008 727 733 A3243G DNAMutation m|SUB|A|3243|G +21850008 856 863 G13513A DNAMutation m|SUB|G|13513|A +21850008 909 916 T10191C DNAMutation m|SUB|T|10191|C +21850008 918 924 p.S45P ProteinMutation p|SUB|S|45|P +21850008 934 941 A11470C DNAMutation m|SUB|A|11470|C +21850008 943 951 p. K237N ProteinMutation p|SUB|K|237|N +21850008 961 968 T13046C DNAMutation m|SUB|T|13046|C +21850008 970 977 p.M237T ProteinMutation p|SUB|M|237|T +21850008 1124 1131 A11470C DNAMutation m|SUB|A|11470|C +21850008 1133 1140 T13046C DNAMutation m|SUB|T|13046|C \ No newline at end of file diff --git a/src/test/scala/com/johnsnowlabs/nlp/util/PubTatorTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/util/PubTatorTestSpec.scala index fbf30eb20f1704..459eb69e4a356b 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/util/PubTatorTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/util/PubTatorTestSpec.scala @@ -17,10 +17,21 @@ class PubTatorTestSpec extends FlatSpec{ //remove file if it's already there val directory = new Directory(new File("./pubtator-conll-test")) directory.deleteRecursively() - val df = PubTator.readDataset(ResourceHelper.spark, "./src/test/resources/corpus_pubtator_sample.txt") + val df = PubTator().readDataset(ResourceHelper.spark, "./src/test/resources/corpus_pubtator_sample.txt") CoNLLGenerator.exportConllFiles(df, "pubtator-conll-test") directory.deleteRecursively() } + "PubTator.readDataset" should "create conll-friendly dataframe with not padding" taggedAs FastTest in { + + //remove file if it's already there + val directory = new Directory(new File("./pubtator_not_padding-conll-test")) + directory.deleteRecursively() + val df = PubTator().readDataset(ResourceHelper.spark, "src/test/resources/corpus_pubtator_not_padding.txt",false) + CoNLLGenerator.exportConllFiles(df, "pubtator_not_padding-conll-test") + directory.deleteRecursively() + + } + }