In [1]:
%run notebook_setup.ipynb

In [2]:
%vault from pubmed_derived_data import literature

Imported `literature` (904B0F94) at Saturday, 25. Jul 2020 15:56

In [3]:
# Start from a frame that shares the index of `literature`: `to_frame()` turns
# the index into a column, which is dropped again so that only the index itself
# is carried over (assumes the index is named `uid` - TODO confirm).
code_repositories = literature.index.to_frame().drop(columns='uid').copy()

## Code archives and repositories extraction

In [4]:
from re import escape
from pandas import Series

from repository_detection import (
    source_code_platforms, mixed_publication_platforms, data_only_platforms,
    all_platforms as platforms
)

In [5]:
source_code_platforms

{'github': '(github\\.com/\\S+/\\S+)',
 'gitlab': '(gitlab\\.com/\\S+/\\S+)',
 'sourceforge': '(sourceforge\\.net/\\S+)',
 'bitbucket': '(bitbucket\\.org/\\S+)',
 '.git': '(\\S+:\\S+\\.git\\S*)',
 'cran': 'cran\\.r-project\\.org/(?:web/packages/|package=)(\\S+)(?:/\\b|/$|\\s|\\.$|\\)|$)',
 'pypi': 'pypi\\.python\\.org/pypi/(\\S+)(?:/\\b|/$|\\s|\\.$|\\)|$)'}

In [6]:
mixed_publication_platforms

{'zenodo': 'doi\\.org/10.5281/(zenodo\\.\\d+?)(?:/\\b|/$|\\s|\\.$|\\)|$)',
 'bioconductor': 'bioconductor.org/packages/(\\S+)(?:/\\b|/$|\\s|\\.$|\\)|$)',
 'osf': 'osf.io/(\\S+){slash_or_end}'}

In [7]:
data_only_platforms

{'dryad': 'datadryad.org/(\\S+)(?:/\\b|/$|\\s|\\.$|\\)|$)'}

In [8]:
def collapse_lists(lists):
    """Merge several lists into a single sorted list of unique items.

    Parameters
    ----------
    lists : iterable of lists
        The lists to merge, e.g. one row of per-field match lists.

    Returns
    -------
    list
        The distinct items found in any of the lists, in ascending order.
    """
    # a single pass over all items; `sum(lists, [])` would copy the accumulated
    # list on every addition, which is quadratic in the number of items
    return sorted({item for sublist in lists for item in sublist})


fields = Series(['abstract', 'full_text'])

for platform in platforms:
    pattern = platforms[platform]
    for field in fields:
        # loose check: the platform name occurs anywhere in the lower-cased text;
        # `.eq(True)` maps missing values (papers without this field) to False
        lowercase_text = literature[field].str.lower()
        code_repositories[f'{field}_mentions_{platform}'] = (
            lowercase_text.str.contains(escape(platform)).eq(True)
        )
        # strict check: all substrings matching the platform pattern
        matches = literature[field].astype(str).str.findall(pattern)
        code_repositories[f'{field}_{platform}_matches'] = matches
    # combine the per-field results into per-paper summaries
    mention_columns = fields + f'_mentions_{platform}'
    match_columns = fields + f'_{platform}_matches'
    code_repositories[f'mentions_{platform}'] = code_repositories[mention_columns].any(axis=1)
    code_repositories[f'{platform}_matches'] = code_repositories[match_columns].apply(collapse_lists, axis=1)

Ext-link handling is not needed for abstracts:

In [9]:
assert not literature.abstract.str.contains('ext-link').any()

In [10]:
code_repositories[[f'abstract_mentions_{platform}' for platform in platforms]].sum()

abstract_mentions_github          54
abstract_mentions_gitlab           2
abstract_mentions_sourceforge      4
abstract_mentions_bitbucket        3
abstract_mentions_.git             2
abstract_mentions_cran            19
abstract_mentions_pypi             2
abstract_mentions_zenodo           1
abstract_mentions_bioconductor    16
abstract_mentions_osf              1
abstract_mentions_dryad            0
dtype: int64

In [11]:
code_repositories[[f'full_text_mentions_{platform}' for platform in platforms]].sum()

full_text_mentions_github          205
full_text_mentions_gitlab            5
full_text_mentions_sourceforge      44
full_text_mentions_bitbucket        15
full_text_mentions_.git             54
full_text_mentions_cran             68
full_text_mentions_pypi              5
full_text_mentions_zenodo           13
full_text_mentions_bioconductor    106
full_text_mentions_osf               4
full_text_mentions_dryad             1
dtype: int64

The SourceForge mentions might refer to tools that were used in the study, rather than to code shared by the authors.

In [12]:
code_repositories[[f'mentions_{platform}' for platform in platforms]].sum()

mentions_github          245
mentions_gitlab            5
mentions_sourceforge      46
mentions_bitbucket        17
mentions_.git             56
mentions_cran             82
mentions_pypi              7
mentions_zenodo           14
mentions_bioconductor    116
mentions_osf               5
mentions_dryad             1
dtype: int64

In [13]:
code_repositories[[f'{platform}_matches' for platform in platforms]].sum()

github_matches          [github.com/Magdoll//ECE, github.com/wizardfan...
gitlab_matches          [gitlab.com/Gustafsson-lab/lassim, gitlab.com/...
sourceforge_matches     [sourceforge.net/., sourceforge.net/projects/a...
bitbucket_matches       [bitbucket.org/hbc/galaxy-central-hbc, bitbuck...
.git_matches            [http://networkx.github.io/, http://broadinsti...
cran_matches            [gplots/index.html, iSubpathwayMiner/, mixOmic...
pypi_matches                             [omics_pipe), MACS2, multiview.]
zenodo_matches          [zenodo.35611, zenodo.546110, zenodo.1154124, ...
bioconductor_matches    [release/data/annotation/html/hgu133plus2.db.h...
osf_matches                                                            []
dryad_matches                                                          []
dtype: object

In [14]:
Series(code_repositories['.git_matches'].sum()).sorted_value_counts()

index
http://broadinstitute.github.io/picard/                                                                                              11
http://broadinstitute.github.io/picard                                                                                                4
http://cole-trapnell-lab.github.io/cufflinks/                                                                                         3
https://broadinstitute.github.io/picard/                                                                                              3
http://zwdzwd.github.io/InfiniumAnnotation                                                                                            2
https://trinotate.github.io                                                                                                           2
http://cole-trapnell-lab.github.io/cufflinks/cuffmerge/                                                                               1
http://cole-trapnell-lab.github.io/cufflin

### Limitations

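 - does not account for GitLab instances in custom domains
 - the `osf` pattern still contains an unformatted `{slash_or_end}` placeholder (see `mixed_publication_platforms` above), which would explain why no OSF link was matched even though five papers mention it
 - matched links may include trailing punctuation (e.g. `github.com/yangzi4/iNMF.`), because `\S+` also consumes a sentence-final full stop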

## Screen for code availability statements

We can compose a list of common code availability phrases and use it to find out what other ways of sharing code are in use.

In [15]:
journals_with_most_papers = literature['journal'].sorted_value_counts().head(20)
journals_with_most_papers.head(10)

index
Scientific reports                              126
Omics : a journal of integrative biology         78
PloS one                                         69
Bioinformatics (Oxford, England)                 68
Nature communications                            58
Frontiers in genetics                            55
Journal of proteomics                            53
BMC bioinformatics                               52
Nucleic acids research                           45
Methods in molecular biology (Clifton, N.J.)     43
Name: journal, dtype: int64

In [16]:
literature = literature.drop(columns=list(set(literature.columns) & set(code_repositories.columns)))

In [17]:
# platforms on which a link is likely to lead to source code
likely_to_contain_code = Series([*source_code_platforms, *mixed_publication_platforms])
# a paper counts when at least one of these platforms has a non-empty
# list of matches in its abstract
source_link_columns = 'abstract_' + likely_to_contain_code + '_matches'
any_source_link = code_repositories[source_link_columns].any(axis=1)
any_source_link.sum()

72

Note: such a link does not have to point to the authors' own source code - it can also be a link to a tool they used!

In [18]:
literature['has_source_code_link'] = any_source_link

In [19]:
import pandas
from pandas import DataFrame
from helpers.n_grams import find_longest_common_n_grams

# the n-gram lists are long; widen the columns so that they are not truncated
pandas.set_option('display.max_colwidth', 500)


statments = []

for journal in journals_with_most_papers.index:
    # papers of this journal for which a source link was detected in the abstract
    in_journal = literature.journal == journal
    relevant_papers = literature[in_journal & literature.has_source_code_link]
    relevant_papers_with_abstracts = relevant_papers[relevant_papers.abstract.notnull()]

    common_n_grams = find_longest_common_n_grams(
        data=relevant_papers_with_abstracts.abstract,
        min_words=3, max_words=10,
        min_count=3, min_frequency=0.5
    )

    statments.append({
        'journal': journal,
        'papers_with_code_link': len(relevant_papers),
        'common_n_grams': common_n_grams
    })

DataFrame(statments)

Unnamed: 0,journal,papers_with_code_link,common_n_grams
0,Scientific reports,0,[]
1,Omics : a journal of integrative biology,0,[]
2,PloS one,0,[]
3,"Bioinformatics (Oxford, England)",42,"[availability and implementation, https github com, supplementary information supplementary data are available at bioinformatics online]"
4,Nature communications,0,[]
5,Frontiers in genetics,5,"[available at https github com, co expression modules, deep learning based, dysfunctional subpathways we, is available at https, multi omics data, number of subtypes]"
6,Journal of proteomics,0,[]
7,BMC bioinformatics,6,"[available at https, chronic lymphocytic leukaemia, is freely available]"
8,Nucleic acids research,0,[]
9,"Methods in molecular biology (Clifton, N.J.)",1,[]


Only _Bioinformatics (Oxford, England)_, _Frontiers in genetics_ and _BMC bioinformatics_ had n-grams which looked like code availability statements.

In [20]:
def strip_match_affixes(column):
    """Reduce an `abstract_<platform>_matches` column name to `<platform>`."""
    return column.replace('abstract_', '').replace('_matches', '')


def non_empty_matches(row):
    """Return a dict with only those platforms of a row that have matches."""
    return row[row.apply(len) != 0].to_dict()


abstract_match_columns = 'abstract_' + Series(list(platforms)) + '_matches'
abstract_matches = code_repositories[abstract_match_columns].rename(columns=strip_match_affixes)

# one dict per paper: platform -> links detected in the abstract
compact_code_links_summary = (
    abstract_matches
    .apply(non_empty_matches, axis=1)
    .to_frame('detected_code_links')
)

In [21]:
def get_statments(expression):
    """Extract statement fragments from the lower-cased abstracts.

    Parameters
    ----------
    expression : str
        Regular expression handed to `Series.str.extract`. It is matched
        against lower-cased text and is intended to have one capture group,
        whose content is reported in the `match` column.

    Returns
    -------
    DataFrame
        One row for each abstract in which `expression` matched, with the
        columns `match` (lower-cased), `detected_code_links` and `journal`.
    """
    matched = (
        literature['abstract'].str.lower()
        # expand=True always gives a frame; the first group lands in column 0
        .str.extract(expression, expand=True)
        .dropna()
        .rename(columns={0: 'match'})
    )
    return (
        matched
        .join(compact_code_links_summary)
        # only the journal is reported, so there is no need to join all columns
        .join(literature[['journal']])
        [['match', 'detected_code_links', 'journal']]
    )

In [22]:
pandas.set_option('display.max_colwidth', 100)

### BMC bioinformatics

In [23]:
get_statments('is freely available (.*?)\n')

Unnamed: 0_level_0,match,detected_code_links,journal
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
17468122,at http://integromics.kobic.re.kr/gazer/.,{},"Bioinformatics (Oxford, England)"
19706745,from http://cran.r-project.org/ or from the web site companion (http://math.univ-toulouse.fr/bio...,{},"Bioinformatics (Oxford, England)"
26568631,at http://biosignal.med.upatras.gr/chronos/,{},"Bioinformatics (Oxford, England)"
27814671,as a bioconductor r package.,{},BMC genomics
28961954,"on request. 50 gb of space is allocated for data storage, with unrestricted number of samples an...",{'github': ['github.com/RonanDaly/pimp']},"Bioinformatics (Oxford, England)"
29931190,at https://github.com/mkanai/grimon as an r package with example omics data sets.,{'github': ['github.com/mkanai/grimon']},"Bioinformatics (Oxford, England)"
30202885,at https://github.com/xuesidong/tobmi.,{'github': ['github.com/XuesiDong/TOBMI.']},"Bioinformatics (Oxford, England)"
30598101,at http://bioinfo.au.tsinghua.edu.cn/jianglab/csnets/ .,{},BMC genomics
30863842,for downloading from https://github.com/pfruan/absnf.,{'github': ['github.com/pfruan/abSNF.']},"Bioinformatics (Oxford, England)"
30957844,on the web at https://cran.r-project.org/web/packages/smccnet/index.html.,{'cran': ['SmCCNet/index.html.']},"Bioinformatics (Oxford, England)"


In [24]:
# raw string: `\s` is a regular expression token, not a Python string escape
get_statments(r'available at (https.*?)\s')

Unnamed: 0_level_0,match,detected_code_links,journal
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
26377073,https://github.com/yangzi4/inmf.,{'github': ['github.com/yangzi4/iNMF.']},"Bioinformatics (Oxford, England)"
26961692,https://github.com/maxconway/snftool,{'github': ['github.com/maxconway/SNFtool']},BMC bioinformatics
28082455,https://github.com/olganikolova/gbgfa.,{'github': ['github.com/olganikolova/gbgfa.']},"Bioinformatics (Oxford, England)"
28640810,https://gitlab.com/gustafsson-lab/lassim.,{'gitlab': ['gitlab.com/Gustafsson-lab/lassim.']},PLoS computational biology
28961954,https://github.com/ronandaly/pimp,{'github': ['github.com/RonanDaly/pimp']},"Bioinformatics (Oxford, England)"
29069501,https://jmorp.megabank.tohoku.ac.jp.,{},Nucleic acids research
29186355,https://github.com/mehr-een/bemkl-rbps.,{'github': ['github.com/mehr-een/bemkl-rbps.']},"Bioinformatics (Oxford, England)"
29547932,https://github.com/cbg-ethz/netics.,{'github': ['github.com/cbg-ethz/netics.']},"Bioinformatics (Oxford, England)"
29931190,https://github.com/mkanai/grimon,{'github': ['github.com/mkanai/grimon']},"Bioinformatics (Oxford, England)"
30202885,https://github.com/xuesidong/tobmi.,{'github': ['github.com/XuesiDong/TOBMI.']},"Bioinformatics (Oxford, England)"


### Bioinformatics (Oxford)

In [25]:
header = 'availability and implementation'

get_statments(f'{escape(header)}\n(.*?)\n')

Unnamed: 0_level_0,match,detected_code_links,journal
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
25527095,the netgestalt crc portal can be freely accessed at http://www.netgestalt.org.,{},"Bioinformatics (Oxford, England)"
26377073,the source code repository is publicly available at https://github.com/yangzi4/inmf.,{'github': ['github.com/yangzi4/iNMF.']},"Bioinformatics (Oxford, England)"
26568631,chronos is freely available at http://biosignal.med.upatras.gr/chronos/,{},"Bioinformatics (Oxford, England)"
26787660,genetrail2 can be freely accessed under https://genetrail2.bioinf.uni-sb.de,{},"Bioinformatics (Oxford, England)"
26883487,"the source code, required data for prediction, and demo data for test are freely available at: h...",{},"Bioinformatics (Oxford, England)"
27797774,http://metagenomics.atc.tcs.com/webigloo and http://121.241.184.233/webigloo [freely available f...,{},"Bioinformatics (Oxford, England)"
28082455,: the code for this work is available at https://github.com/olganikolova/gbgfa.,{'github': ['github.com/olganikolova/gbgfa.']},"Bioinformatics (Oxford, England)"
28334215,matlab codes for flux balance analysis in this study are available in supplementary material.,{},"Bioinformatics (Oxford, England)"
28407042,the source code is at https://github.com/zhangxf-ccnu/pdna.,{'github': ['github.com/Zhangxf-ccnu/pDNA.']},"Bioinformatics (Oxford, England)"
28520848,"pfa has been implemented as a matlab package, which is available at http://www.sysbio.ac.cn/cb/c...",{},"Bioinformatics (Oxford, England)"


## Store the data

In [26]:
%vault store code_repositories in pubmed_derived_data

Stored `code_repositories` (C557967A → 5FF4AA2D) at Saturday, 25. Jul 2020 15:56