Memory usage is too high on popular author selection

Output from the memory profiler for the 'ENCODE' query.
Note that the call to the `popular_authors` function (profiler line 96) results in a memory allocation of roughly 2.5 GB (an increment of 2497.7 MiB).

```
Line #    Mem usage    Increment   Line Contents
================================================
    35    203.3 MiB    203.3 MiB       @profile
    36                                 def launch(self, *terms, task=None):
    37                                     """:return full log"""
    38                             
    39    203.3 MiB      0.0 MiB           try:
    40                                         # Search articles relevant to the terms
    41    203.3 MiB      0.0 MiB               self.terms = terms
    42    219.7 MiB     16.3 MiB               self.ids = self.loader.search(*terms, current=1, task=task)
    43    219.7 MiB      0.0 MiB               self.n_papers = len(self.ids)
    44                             
    45                                         # Nothing found
    46    219.7 MiB      0.0 MiB               if self.n_papers == 0:
    47                                             raise RuntimeError("Nothing found")
    48                             
    49                                         # Load data about publications, citations and co-citations
    50    670.1 MiB    450.5 MiB               self.pub_df = self.loader.load_publications(current=2, task=task)
    51    670.1 MiB      0.0 MiB               if len(self.pub_df) == 0:
    52                                             raise RuntimeError("Nothing found in DB")
    53                             
    54    687.6 MiB     17.5 MiB               cit_stats_df_from_query = self.loader.load_citation_stats(current=3, task=task)
    55    691.8 MiB      4.2 MiB               self.cit_stats_df = self.build_cit_df(cit_stats_df_from_query, self.n_papers, current=3.5, task=task)
    56    691.8 MiB      0.0 MiB               if len(self.cit_stats_df) == 0:
    57                                             raise RuntimeError("Citations stats not found DB")
    58                             
    59    691.8 MiB      0.0 MiB               self.df, self.min_year, self.max_year, self.citation_years = self.merge_citation_stats(self.pub_df,
    60    714.3 MiB     22.5 MiB                                                                                                      self.cit_stats_df)
    61    714.3 MiB      0.0 MiB               if len(self.df) == 0:
    62                                             raise RuntimeError("Failed to merge publications and citations")
    63                             
    64    890.8 MiB    176.5 MiB               self.cocit_df = self.loader.load_cocitations(current=4, task=task)
    65    966.6 MiB     75.8 MiB               cocit_grouped_df = self.build_cocit_grouped_df(self.cocit_df)
    66    971.8 MiB      5.1 MiB               self.CG = self.build_cocitation_graph(cocit_grouped_df, current=5, task=task)
    67    971.8 MiB      0.0 MiB               if len(self.CG.nodes()) == 0:
    68                                             raise RuntimeError("Failed to build co-citations graph")
    69                             
    70                                         # Perform subtopic analysis and get subtopic descriptions
    71    971.8 MiB      0.0 MiB               self.df, self.components, self.comp_other, self.pm, self.pmcomp_sizes = self.subtopic_analysis(
    72   1001.8 MiB     30.0 MiB                   self.df, self.CG, current=7, task=task
    73                                         )
    74   1097.1 MiB     95.3 MiB               self.df_kwd = self.subtopic_descriptions(self.df)
    75                             
    76                                         # Find interesting papers
    77   1097.1 MiB      0.0 MiB               self.top_cited_papers, self.top_cited_df = self.find_top_cited_papers(self.df, current=8, task=task)
    78                             
    79   1097.1 MiB      0.0 MiB               self.max_gain_papers, self.max_gain_df = self.find_max_gain_papers(self.df, self.citation_years,
    80   1097.1 MiB      0.0 MiB                                                                                  current=9, task=task)
    81                             
    82   1097.1 MiB      0.0 MiB               self.max_rel_gain_papers, self.max_rel_gain_df = self.find_max_relative_gain_papers(
    83   1140.3 MiB     43.2 MiB                   self.df, self.citation_years, current=10, task=task
    84                                         )
    85                             
    86                                         # Perform subtopic evolution analysis and get subtopic descriptions
    87   1140.3 MiB      0.0 MiB               self.evolution_df, self.evolution_year_range = self.subtopic_evolution_analysis(self.cocit_df, current=11,
    88   1171.7 MiB     31.4 MiB                                                                                               task=task)
    89   1171.7 MiB      0.0 MiB               self.evolution_kwds = self.subtopic_evolution_descriptions(self.df, self.evolution_df,
    90   1567.8 MiB    396.1 MiB                                                                          self.evolution_year_range, self.terms)
    91                             
    92                                         # Find top journals
    93   1218.3 MiB      0.0 MiB               self.journal_stats = self.popular_journals(self.df, current=12, task=task)
    94                             
    95                                         # Find top authors
    96   3715.9 MiB   2497.7 MiB               self.author_stats = self.popular_authors(self.df, current=13, task=task)
    97                             
    98   3715.9 MiB      0.0 MiB               return self.logger.stream.getvalue()
    99                                     finally:
   100   3715.9 MiB      0.0 MiB               self.loader.close_connection()
   101   3715.9 MiB      0.0 MiB               self.logger.remove_handler()
```

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Memory usage is too high on popular author selection #119

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Memory usage is too high on popular author selection #119

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions