"""
Classes that represent different GEO entities
"""
import os
import re
import abc
import gzip
import json
import time
import numpy as np
import platform
from pandas import DataFrame, concat
from multiprocessing import Pool
try:
from urllib.error import HTTPError
except ImportError:
from urllib2 import HTTPError
from six import iteritems, itervalues
from . import utils
from .sra_downloader import SRADownloader
from .logger import geoparse_logger as logger
def _sra_download_worker(*args):
"""A worker to download SRA files.
To be used with multiprocessing.
"""
gsm = args[0][0]
email = args[0][1]
dirpath = args[0][2]
kwargs = args[0][3]
return (gsm.get_accession(), gsm.download_SRA(email, dirpath, **kwargs))
def _supplementary_files_download_worker(*args):
"""A worker to download supplementary files.
To be used with multiprocessing.
"""
gsm = args[0][0]
download_sra = args[0][1]
email = args[0][2]
dirpath = args[0][3]
sra_kwargs = args[0][4]
return (gsm.get_accession(), gsm.download_supplementary_files(
directory=dirpath,
download_sra=download_sra,
email=email, **sra_kwargs))
class DataIncompatibilityException(Exception):
    """Raised when a GEO object's data table columns do not match its
    columns-description index."""
    pass
class BaseGEO(object):
    """Abstract base class for all GEO entities.

    Subclasses must set :attr:`geotype` and implement
    :meth:`_get_object_as_soft`.
    """

    # NOTE(review): Py2-style metaclass declaration; it has no effect on
    # Python 3 (where the class is therefore instantiable). Kept as-is for
    # backward compatibility.
    __metaclass__ = abc.ABCMeta

    # SOFT entity type (e.g. "SAMPLE"); overridden by every subclass.
    geotype = None

    def __init__(self, name, metadata):
        """Initialize base GEO object.

        Args:
            name (:obj:`str`): Name of the object.
            metadata (:obj:`dict`): Metadata information.

        Raises:
            TypeError: Metadata should be a dict.
        """
        if not isinstance(metadata, dict):
            raise TypeError("Metadata should be a dictionary not a %s" % str(
                type(metadata)))
        self.name = name
        self.metadata = metadata
        self.relations = {}
        if 'relation' in self.metadata:
            for relation in self.metadata['relation']:
                # Split only on the first ": " so that values which
                # themselves contain ": " (e.g. URLs followed by text)
                # are not truncated.
                tmp = re.split(r':\s+', relation, maxsplit=1)
                relname = tmp[0]
                relval = tmp[1]

                if relname in self.relations:
                    self.relations[relname].append(relval)
                else:
                    self.relations[relname] = [relval]

    def get_accession(self):
        """Return accession ID of the sample.

        Returns:
            :obj:`str`: GEO accession ID
        """
        return self.get_metadata_attribute("geo_accession")

    def get_type(self):
        """Get the type of the GEO.

        Returns:
            :obj:`str`: Type attribute of the GEO, or None when the
            metadata carries no "type" entry.
        """
        try:
            return self.get_metadata_attribute("type")
        # NOTE(review): NoMetadataException is defined elsewhere in this
        # module (not visible in this chunk).
        except NoMetadataException:
            return None

    def _get_metadata_as_string(self):
        """Get the metadata as SOFT formatted string."""
        metalist = []
        for metaname, meta in iteritems(self.metadata):
            message = "Single value in metadata dictionary should be a list!"
            assert isinstance(meta, list), message
            for data in meta:
                if data:
                    metalist.append("!%s_%s = %s" % (self.geotype.capitalize(),
                                                     metaname, data))
        return "\n".join(metalist)

    def to_soft(self, path_or_handle, as_gzip=False):
        """Save the object in a SOFT format.

        Args:
            path_or_handle (:obj:`str` or :obj:`file`): Path or handle to
                output file
            as_gzip (:obj:`bool`): Save as gzip. Only honoured when a path
                (not a handle) is given.
        """
        if isinstance(path_or_handle, str):
            if as_gzip:
                with gzip.open(path_or_handle, 'wt') as outfile:
                    outfile.write(self._get_object_as_soft())
            else:
                with open(path_or_handle, 'w') as outfile:
                    outfile.write(self._get_object_as_soft())
        else:
            path_or_handle.write(self._get_object_as_soft())

    @abc.abstractmethod
    def _get_object_as_soft(self):
        """Get the object as SOFT formatted string."""
        raise NotImplementedError("Method not implemented")

    def __str__(self):
        return str("<%s: %s>" % (self.geotype, self.name))

    def __repr__(self):
        return str("<%s: %s>" % (self.geotype, self.name))
class SimpleGEO(BaseGEO):
    """Abstract base for GEO entities that carry a data table plus a
    description of its columns (GSM, GPL, GDS)."""

    # NOTE(review): Py2-style metaclass declaration; ineffective on
    # Python 3. Kept for backward compatibility.
    __metaclass__ = abc.ABCMeta

    def __init__(self, name, metadata, table, columns):
        """Initialize simple GEO object.

        Args:
            name (:obj:`str`): Name of the object
            metadata (:obj:`dict`): Metadata information
            table (:obj:`pandas.DataFrame`): Table with the data from SOFT file
            columns (:obj:`pandas.DataFrame`): Description of the columns,
                number of columns, order and names represented as index
                in this DataFrame has to be the same as table.columns.

        Raises:
            :obj:`ValueError`: Table should be a DataFrame
            :obj:`ValueError`: Columns' description should be a DataFrame
            :obj:`DataIncompatibilityException`: Columns are wrong
            :obj:`ValueError`: Description has to be present in columns
        """
        if not isinstance(table, DataFrame):
            raise ValueError(("Table data should be an instance of "
                              "pandas.DataFrame not %s") % str(type(table)))
        if not isinstance(columns, DataFrame):
            raise ValueError(("Columns description should be an instance of "
                              "pandas.DataFrame not %s") % str(type(columns)))
        BaseGEO.__init__(self, name=name, metadata=metadata)

        self.table = table
        self.columns = columns
        columns_are_correct = False
        if self.columns.index.tolist() != self.table.columns.tolist():
            if not self.columns.index.is_unique:
                # Try to correct duplicated index entries the same way
                # pandas mangles duplicated column names: name, name.1, ...
                logger.warning(
                    "Detected duplicated columns in %s %s. Correcting.\n" % (
                        self.geotype,
                        self.name))
                indices = {}
                new_index = []
                for idx in self.columns.index:
                    if idx not in indices:
                        indices[idx] = 0
                        new_index.append(idx)
                    else:
                        indices[idx] += 1
                        new_index.append("%s.%i" % (idx, indices[idx]))
                self.columns.index = new_index
                if self.columns.index.tolist() == self.table.columns.tolist():
                    columns_are_correct = True
            if not columns_are_correct:
                # If the columns are still not correct, the sets may match
                # but be out of order -- reorder the description to match
                # the table.
                if sorted(self.columns.index.tolist()) == sorted(
                        self.table.columns.tolist()):
                    logger.warning(
                        "Data columns in %s %s are not in order. "
                        "Reordering.\n" % (
                            self.geotype,
                            self.name))
                    self.columns = self.columns.loc[self.table.columns]
                    if self.columns.index.tolist() == \
                            self.table.columns.tolist():
                        columns_are_correct = True
        else:
            columns_are_correct = True

        if not columns_are_correct:
            rows_in_columns = ", ".join(self.columns.index.tolist())
            columns_in_table = ", ".join(self.table.columns.tolist())
            raise DataIncompatibilityException(
                "\nData columns do not match columns description index in %s\n" % (
                    self.name) +
                "Columns in table are: %s\n" % columns_in_table +
                "Index in columns are: %s\n" % rows_in_columns)
        if self.columns.columns[0] != 'description':
            raise ValueError(("Columns table must contain a column named "
                              "'description'. Here columns are: %s") % (
                ", ".join(map(str, self.columns.columns))))

    def head(self):
        """Return a short summary string of the object (metadata excerpt,
        columns and the first/last table rows)."""
        summary = list()
        summary.append("%s %s" % (self.geotype, self.name) + "\n")
        summary.append(" - Metadata:" + "\n")
        summary.append(
            "\n".join(self._get_metadata_as_string().split("\n")[:5]) + "\n")
        summary.append("\n")
        summary.append(" - Columns:" + "\n")
        summary.append(self.columns.to_string() + "\n")
        summary.append("\n")
        summary.append(" - Table:" + "\n")
        summary.append(
            "\t".join(["Index"] + self.table.columns.tolist()) + "\n")
        summary.append(self.table.head().to_string(header=None) + "\n")
        summary.append(" " * 40 + "..." + " " * 40 + "\n")
        summary.append(" " * 40 + "..." + " " * 40 + "\n")
        summary.append(" " * 40 + "..." + " " * 40 + "\n")
        summary.append(self.table.tail().to_string(header=None) + "\n")
        return "\n".join([str(s) for s in summary])

    def show_columns(self):
        """Print columns in SOFT format."""
        print(self.columns)

    def show_table(self, number_of_lines=5):
        """Show few lines of the table the table as pandas.DataFrame.

        Args:
            number_of_lines (:obj:`int`): Number of lines to show. Defaults
                to 5.
        """
        print(self.table.head(number_of_lines))

    def _get_object_as_soft(self):
        """Get the object as SOFT formatted string."""
        soft = ["^%s = %s" % (self.geotype, self.name),
                self._get_metadata_as_string(),
                self._get_columns_as_string(),
                self._get_table_as_string()]
        return "\n".join(soft)

    def _get_table_as_string(self):
        """Get table as SOFT formatted string."""
        tablelist = []
        tablelist.append("!%s_table_begin" % self.geotype.lower())
        tablelist.append("\t".join(self.table.columns))
        for idx, row in self.table.iterrows():
            tablelist.append("\t".join(map(str, row)))
        tablelist.append("!%s_table_end" % self.geotype.lower())
        return "\n".join(tablelist)

    def _get_columns_as_string(self):
        """Returns columns as SOFT formatted string."""
        columnslist = []
        for rowidx, row in self.columns.iterrows():
            columnslist.append("#%s = %s" % (rowidx, row.description))
        return "\n".join(columnslist)
class GSM(SimpleGEO):
    """Class that represents sample from GEO database."""

    geotype = 'SAMPLE'

    def annotate(self, gpl, annotation_column, gpl_on="ID", gsm_on="ID_REF",
                 in_place=False):
        """Annotate GSM with provided GPL.

        Args:
            gpl (:obj:`pandas.DataFrame`): A Platform or DataFrame to
                annotate with.
            annotation_column (:obj:`str`): Column in a table for annotation.
            gpl_on (:obj:`str`): Use this column in GPL to merge.
                Defaults to "ID".
            gsm_on (:obj:`str`): Use this column in GSM to merge.
                Defaults to "ID_REF".
            in_place (:obj:`bool`): Substitute table in GSM by new annotated
                table. Defaults to False.

        Returns:
            :obj:`pandas.DataFrame` or :obj:`None`: Annotated table or None

        Raises:
            TypeError: GPL should be GPL or pandas.DataFrame
        """
        if isinstance(gpl, GPL):
            annotation_table = gpl.table
        elif isinstance(gpl, DataFrame):
            annotation_table = gpl
        else:
            raise TypeError("gpl should be a GPL object or a pandas.DataFrame")

        # Annotate by merging, then drop the now-redundant GPL merge key.
        annotated = self.table.merge(
            annotation_table[[gpl_on, annotation_column]], left_on=gsm_on,
            right_on=gpl_on)
        del annotated[gpl_on]
        if in_place:
            self.table = annotated
            return None
        else:
            return annotated

    def annotate_and_average(self, gpl, expression_column, group_by_column,
                             rename=True, force=False, merge_on_column=None,
                             gsm_on=None, gpl_on=None):
        """Annotate GSM table with provided GPL.

        Args:
            gpl (:obj:`GEOTypes.GPL`): Platform for annotations
            expression_column (:obj:`str`): Column name which "expressions"
                are represented
            group_by_column (:obj:`str`): The data will be grouped and
                averaged over this column and only this column will be kept
            rename (:obj:`bool`): Rename output column to the
                self.name. Defaults to True.
            force (:obj:`bool`): If the name of the GPL does not match the
                platform name in GSM proceed anyway. Defaults to False.
            merge_on_column (:obj:`str`): Column to merge the data
                on. Defaults to None.
            gsm_on (:obj:`str`): In the case columns to merge are different
                in GSM and GPL use this column in GSM. Defaults to None.
            gpl_on (:obj:`str`): In the case columns to merge are different
                in GSM and GPL use this column in GPL. Defaults to None.

        Returns:
            :obj:`pandas.DataFrame`: Annotated data

        Raises:
            KeyError: GPL does not match the sample's platform
                (unless force=True).
        """
        if gpl.name != self.metadata['platform_id'][0] and not force:
            raise KeyError("Platforms from GSM (%s) and from GPL (%s)" % (
                gpl.name, self.metadata['platform_id']) +
                " are incompatible. Use force=True to use this GPL.")
        if merge_on_column is None and gpl_on is None and gsm_on is None:
            raise Exception("You have to provide one of the two: "
                            "merge_on_column or gpl_on and gsm_on parameters")
        if merge_on_column:
            logger.info("merge_on_column is not None. Using this option.")
            tmp_data = self.table.merge(gpl.table, on=merge_on_column,
                                        how='outer')
        else:
            if gpl_on is None or gsm_on is None:
                raise Exception("Please provide both gpl_on and gsm_on or "
                                "provide merge_on_column only")
            tmp_data = self.table.merge(gpl.table, left_on=gsm_on,
                                        right_on=gpl_on, how='outer')
        # Group and average the expression values (identical for both merge
        # variants, hence hoisted out of the branches).
        tmp_data = tmp_data.groupby(group_by_column).mean()[
            [expression_column]]
        if rename:
            tmp_data.columns = [self.name]
        return tmp_data

    def download_supplementary_files(self, directory="./", download_sra=True,
                                     email=None, sra_kwargs=None):
        """Download all supplementary data available for the sample.

        Args:
            directory (:obj:`str`): Directory to download the data (in this
                directory function will create new directory with the
                files). Defaults to "./".
            download_sra (:obj:`bool`): Indicates whether to download SRA
                raw data too. Defaults to True.
            email (:obj:`str`): E-mail that will be provided to the Entrez.
                It is mandatory if download_sra=True. Defaults to None.
            sra_kwargs (:obj:`dict`, optional): Kwargs passed to the
                download_SRA method. Defaults to None.

        Returns:
            :obj:`dict`: A key-value pair of name taken from the metadata and
                paths downloaded, in the case of SRA files the key is ``SRA``.
        """
        directory_path = os.path.abspath(
            os.path.join(directory, "%s_%s_%s" % (
                'Supp',
                self.get_accession(),
                # the directory name cannot contain many of the signs
                re.sub(r'[\s\*\?\(\),\.;]', '_', self.metadata['title'][0]))))
        utils.mkdir_p(os.path.abspath(directory_path))
        downloaded_paths = dict()
        if sra_kwargs is None:
            sra_kwargs = {}
        # Possible erroneous values that could be identified and skipped
        # right after
        blacklist = ('NONE',)
        for metakey, metavalue in iteritems(self.metadata):
            if 'supplementary_file' in metakey:
                # Each supplementary_file entry must hold exactly one
                # non-empty URL (checking metavalue[0], not the list itself).
                assert len(metavalue) == 1 and metavalue[0] != '', (
                    "Wrong supplementary file entry for %s" % metakey)
                if metavalue[0] in blacklist:
                    logger.warning("%s value is blacklisted as '%s' - skipping"
                                   % (metakey, metavalue[0]))
                    continue
                # SRA will be downloaded elsewhere
                if 'sra' not in metavalue[0]:
                    # directory_path is already absolute, so the remote file
                    # name is joined directly onto it.
                    download_path = os.path.abspath(os.path.join(
                        directory_path, metavalue[0].split("/")[-1]))
                    try:
                        utils.download_from_url(metavalue[0], download_path)
                        downloaded_paths[metavalue[0]] = download_path
                    except Exception as err:
                        logger.error(
                            "Cannot download %s supplementary file (%s)" % (
                                self.get_accession(), err))
        if download_sra:
            try:
                downloaded_files = self.download_SRA(
                    email,
                    directory=directory,
                    **sra_kwargs)
                downloaded_paths.update(downloaded_files)
            except Exception as err:
                logger.error("Cannot download %s SRA file (%s)" % (
                    self.get_accession(), err))
        return downloaded_paths

    def download_SRA(self, email, directory='./', **kwargs):
        """Download RAW data as SRA file.

        The files will be downloaded to the sample directory created ad hoc
        or the directory specified by the parameter. The sample has to come
        from sequencing eg. mRNA-seq, CLIP etc.

        An important parameter is a filetype. By default an SRA
        is accessed by FTP and such file is downloaded. This does not
        require additional libraries. However in order
        to produce FASTA or FASTQ files one would need to use SRA-Toolkit.
        Thus, it is assumed that this library is already installed or it
        will be installed in the near future. One can immediately specify
        the download type to fasta or fastq.

        To see all possible ``**kwargs`` that could be passed to the function
        see the description of
        :class:`~GEOparse.sra_downloader.SRADownloader`.

        Args:
            email (:obj:`str`): an email (any) - Required by NCBI for access
            directory (:obj:`str`, optional): The directory to which download
                the data. Defaults to "./".
            **kwargs: Arbitrary keyword arguments, see description

        Returns:
            :obj:`dict`: A dictionary containing only one key (``SRA``) with
                the list of downloaded files.

        Raises:
            :obj:`TypeError`: Type to download unknown
            :obj:`NoSRARelationException`: No SRAToolkit
            :obj:`Exception`: Wrong e-mail
            :obj:`HTTPError`: Cannot access or connect to DB
        """
        downloader = SRADownloader(self, email, directory, **kwargs)
        return {"SRA": downloader.download()}
class GPL(SimpleGEO):
    """Class that represents platform from GEO database"""

    geotype = "PLATFORM"

    def __init__(self, name, metadata, table=None, columns=None, gses=None,
                 gsms=None, database=None):
        """Initialize GPL.

        Args:
            name (:obj:`str`): Name of the object
            metadata (:obj:`dict`): Metadata information
            table (:obj:`pandas.DataFrame`, optional): Table with actual GPL
                data. Defaults to None (empty DataFrame).
            columns (:obj:`pandas.DataFrame`, optional): Table with
                description of the columns. Defaults to None (empty
                DataFrame).
            gses (:obj:`dict` of :obj:`GEOparse.GSE`, optional): A dictionary
                of GSE objects. Defaults to None.
            gsms (:obj:`dict` of :obj:`GEOparse.GSM`, optional): A dictionary
                of GSM objects. Defaults to None.
            database (:obj:`GEOparse.GEODatabase`, optional): A database
                object from SOFT file associated with GPL. Defaults to None.

        Raises:
            ValueError: gses/gsms not a dict, or database not a GEODatabase.
        """
        gses = {} if gses is None else gses
        if not isinstance(gses, dict):
            raise ValueError(
                "GSEs should be a dictionary not a %s" % str(type(gses)))
        gsms = {} if gsms is None else gsms
        if not isinstance(gsms, dict):
            raise ValueError(
                "GSMs should be a dictionary not a %s" % str(type(gsms)))

        for gsm_name, gsm in iteritems(gsms):
            assert isinstance(gsm, GSM), "All GSMs should be of type GSM"
        for gse_name, gse in iteritems(gses):
            assert isinstance(gse, GSE), "All GSEs should be of type GSE"

        if database is not None:
            if not isinstance(database, GEODatabase):
                raise ValueError(
                    "Database should be a GEODatabase not a %s" % str(
                        type(database)))

        # SimpleGEO requires DataFrames; substitute empty ones when absent.
        table = DataFrame() if table is None else table
        columns = DataFrame() if columns is None else columns
        SimpleGEO.__init__(self, name=name, metadata=metadata, table=table,
                           columns=columns)
        self.gses = gses
        self.gsms = gsms
        self.database = database
class GDSSubset(BaseGEO):
    """Class that represents a subset from GEO GDS object."""

    geotype = "SUBSET"

    def _get_object_as_soft(self):
        """Get the object as SOFT formatted string."""
        soft = ["^%s = %s" % (self.geotype, self.name),
                self._get_metadata_as_string()]
        return "\n".join(soft)
class GEODatabase(BaseGEO):
    """Class that represents the database section of a SOFT file."""

    geotype = "DATABASE"

    def _get_object_as_soft(self):
        """Return object as SOFT formatted string."""
        soft = ["^%s = %s" % (self.geotype, self.name),
                self._get_metadata_as_string()]
        return "\n".join(soft)
class GDS(SimpleGEO):
    """Class that represents a dataset from GEO database"""

    geotype = "DATASET"

    def __init__(self, name, metadata, table, columns, subsets, database=None):
        """Initialize GDS.

        Args:
            name (:obj:`str`): Name of the object.
            metadata (:obj:`dict`): Metadata information.
            table (:obj:`pandas.DataFrame`): Table with the data from SOFT
                file.
            columns (:obj:`pandas.DataFrame`): description of the columns,
                number of columns, order, and names represented as index in
                this DataFrame has to be the same as table.columns.
            subsets (:obj:`dict` of :obj:`GEOparse.GDSSubset`): GDSSubset from
                GDS soft file.
            database (:obj:`GEOparse.Database`, optional): Database from SOFT
                file. Defaults to None.

        Raises:
            ValueError: subsets not a dict, or database not a GEODatabase.
        """
        if not isinstance(subsets, dict):
            raise ValueError(
                "Subsets should be a dictionary not a %s" % str(type(subsets)))
        if database is not None:
            if not isinstance(database, GEODatabase):
                raise ValueError(
                    "Database should be a GEODatabase not a %s" % str(
                        type(database)))
        SimpleGEO.__init__(self, name=name, metadata=metadata, table=table,
                           columns=columns)
        # Drop description rows with NaN -- effectively deletes the columns
        # with ID_REF.
        self.columns = self.columns.dropna()
        self.subsets = subsets
        self.database = database

        for subset_name, subset in iteritems(subsets):
            message = "All subsets should be of type GDSSubset"
            assert isinstance(subset, GDSSubset), message

    def _get_object_as_soft(self):
        """Return object as SOFT formatted string."""
        soft = []
        if self.database is not None:
            soft.append(self.database._get_object_as_soft())
        soft += ["^%s = %s" % (self.geotype, self.name),
                 self._get_metadata_as_string()]
        for subset in self.subsets.values():
            soft.append(subset._get_object_as_soft())
        # The entity header is repeated before the columns/table sections,
        # mirroring the layout of GDS SOFT files.
        soft += ["^%s = %s" % (self.geotype, self.name),
                 self._get_columns_as_string(),
                 self._get_table_as_string()]
        return "\n".join(soft)
class GSE(BaseGEO):
    """Class representing GEO series"""

    geotype = "SERIES"

    def __init__(self, name, metadata, gpls=None, gsms=None, database=None):
        """Initialize GSE.

        Args:
            name (:obj:`str`): Name of the object.
            metadata (:obj:`dict`): Metadata information.
            gpls (:obj:`dict` of :obj:`GEOparse.GPL`, optional): A dictionary
                of GPL objects. Defaults to None.
            gsms (:obj:`dict` of :obj:`GEOparse.GSM`, optional): A dictionary
                of GSM objects. Defaults to None.
            database (:obj:`GEOparse.Database`, optional): Database from SOFT
                file. Defaults to None.

        Raises:
            ValueError: gpls/gsms not a dict, or database not a GEODatabase.
        """
        gpls = {} if gpls is None else gpls
        if not isinstance(gpls, dict):
            raise ValueError(
                "GPLs should be a dictionary not a %s" % str(type(gpls)))
        gsms = {} if gsms is None else gsms
        if not isinstance(gsms, dict):
            raise ValueError(
                "GSMs should be a dictionary not a %s" % str(type(gsms)))

        for gsm_name, gsm in iteritems(gsms):
            assert isinstance(gsm, GSM), "All GSMs should be of type GSM"
        for gpl_name, gpl in iteritems(gpls):
            assert isinstance(gpl, GPL), "All GPLs should be of type GPL"

        if database is not None:
            if not isinstance(database, GEODatabase):
                raise ValueError(
                    "Database should be a GEODatabase not a %s" % str(
                        type(database)))

        BaseGEO.__init__(self, name=name, metadata=metadata)

        self.gpls = gpls
        self.gsms = gsms
        self.database = database
        # Lazily-built cache for the phenotype_data property.
        self._phenotype_data = None

    @property
    def phenotype_data(self):
        """Get the phenotype data for each of the sample."""
        if self._phenotype_data is None:
            pheno_data = {}
            for gsm_name, gsm in iteritems(self.gsms):
                tmp = {}
                for key, value in iteritems(gsm.metadata):
                    if len(value) == 0:
                        tmp[key] = np.nan
                    elif key.startswith("characteristics_"):
                        # Expand "type: value" characteristics into
                        # separate columns named <key>.<i>.<type>.
                        for i, char in enumerate(value):
                            char = re.split(r":\s+", char)
                            char_type, char_value = [char[0],
                                                     ": ".join(char[1:])]
                            tmp[key + "." + str(
                                i) + "." + char_type] = char_value
                    else:
                        tmp[key] = ",".join(value)
                pheno_data[gsm_name] = tmp
            self._phenotype_data = DataFrame(pheno_data).T
        return self._phenotype_data

    def merge_and_average(self, platform, expression_column, group_by_column,
                          force=False, merge_on_column=None, gsm_on=None,
                          gpl_on=None):
        """Merge and average GSE samples.

        For given platform prepare the DataFrame with all the samples present
        in the GSE annotated with given column from platform and averaged
        over the column.

        Args:
            platform (:obj:`str` or :obj:`GEOparse.GPL`): GPL platform to use.
            expression_column (:obj:`str`): Column name in which "expressions"
                are represented
            group_by_column (:obj:`str`): The data will be grouped and
                averaged over this column and only this column will be kept
            force (:obj:`bool`): If the name of the GPL does not match the
                platform name in GSM proceed anyway
            merge_on_column (:obj:`str`): Column to merge the data on - should
                be present in both GSM and GPL
            gsm_on (:obj:`str`): In the case columns to merge are different in
                GSM and GPL use this column in GSM
            gpl_on (:obj:`str`): In the case columns to merge are different in
                GSM and GPL use this column in GPL

        Returns:
            :obj:`pandas.DataFrame`: Merged and averaged table of results.

        Raises:
            ValueError: platform is neither a GPL nor a str key.
        """
        if isinstance(platform, str):
            gpl = self.gpls[platform]
        elif isinstance(platform, GPL):
            gpl = platform
        else:
            raise ValueError("Platform has to be of type GPL or string with "
                             "key for platform in GSE")

        data = []
        # Only samples measured on the requested platform are merged.
        for gsm in self.gsms.values():
            if gpl.name == gsm.metadata['platform_id'][0]:
                data.append(gsm.annotate_and_average(
                    gpl=gpl,
                    merge_on_column=merge_on_column,
                    expression_column=expression_column,
                    group_by_column=group_by_column,
                    force=force,
                    gpl_on=gpl_on,
                    gsm_on=gsm_on))
        if len(data) == 0:
            logger.warning("No samples for the platform were found\n")
            return None
        elif len(data) == 1:
            return data[0]
        else:
            return data[0].join(data[1:])

    def pivot_samples(self, values, index="ID_REF"):
        """Pivot samples by specified column.

        Construct a table in which columns (names) are the samples, index
        is a specified column eg. ID_REF and values in the columns are of one
        specified type.

        Args:
            values (:obj:`str`): Column name present in all GSMs.
            index (:obj:`str`, optional): Column name that will become an
                index in pivoted table. Defaults to "ID_REF".

        Returns:
            :obj:`pandas.DataFrame`: Pivoted data
        """
        data = []
        for gsm in self.gsms.values():
            tmp_data = gsm.table.copy()
            tmp_data["name"] = gsm.name
            data.append(tmp_data)
        ndf = concat(data).pivot(index=index, values=values, columns="name")
        return ndf

    def pivot_and_annotate(self, values, gpl, annotation_column, gpl_on="ID",
                           gsm_on="ID_REF"):
        """Annotate GSM with provided GPL.

        Args:
            values (:obj:`str`): Column to use as values eg. "VALUES"
            gpl (:obj:`pandas.DataFrame` or :obj:`GEOparse.GPL`): A Platform
                or DataFrame to annotate with.
            annotation_column (:obj:`str`): Column in table for annotation.
            gpl_on (:obj:`str`, optional): Use this column in GPL to merge.
                Defaults to "ID".
            gsm_on (:obj:`str`, optional): Use this column in GSM to merge.
                Defaults to "ID_REF".

        Returns:
            pandas.DataFrame: Pivoted and annotated table of results

        Raises:
            TypeError: gpl should be GPL or pandas.DataFrame
        """
        if isinstance(gpl, GPL):
            annotation_table = gpl.table
        elif isinstance(gpl, DataFrame):
            annotation_table = gpl
        else:
            raise TypeError("gpl should be a GPL object or a pandas.DataFrame")
        pivoted_samples = self.pivot_samples(values=values, index=gsm_on)
        ndf = pivoted_samples.reset_index().merge(
            annotation_table[[gpl_on, annotation_column]],
            left_on=gsm_on,
            right_on=gpl_on).set_index(gsm_on)
        del ndf[gpl_on]
        ndf.columns.name = 'name'
        return ndf

    def download_supplementary_files(self, directory='series',
                                     download_sra=True, email=None,
                                     sra_kwargs=None, nproc=1):
        """Download supplementary data.

        .. warning::

            Do not use parallel option (nproc > 1) in the interactive shell.
            For more details see `this issue
            <https://stackoverflow.com/questions/23641475/multiprocessing-working-in-python-but-not-in-ipython/23641560#23641560>`_
            on SO.

        Args:
            directory (:obj:`str`, optional): Directory to download the data
                (in this directory function will create new directory with
                the files), by default this will be named with the series
                name + _Supp.
            download_sra (:obj:`bool`, optional): Indicates whether to
                download SRA raw data too. Defaults to True.
            email (:obj:`str`, optional): E-mail that will be provided to the
                Entrez. Defaults to None.
            sra_kwargs (:obj:`dict`, optional): Kwargs passed to the
                GSM.download_SRA method. Defaults to None.
            nproc (:obj:`int`, optional): Number of processes for SRA download
                (default is 1, no parallelization).

        Returns:
            :obj:`dict`: Downloaded data for each of the GSM

        Raises:
            ValueError: nproc is lower than 1.
        """
        if sra_kwargs is None:
            sra_kwargs = dict()
        if directory == 'series':
            dirpath = os.path.abspath(self.get_accession() + "_Supp")
        else:
            dirpath = os.path.abspath(directory)
        utils.mkdir_p(dirpath)
        if nproc == 1:
            # No need to parallelize, running ordinary download in loop
            downloaded_paths = dict()
            for gsm in itervalues(self.gsms):
                logger.info(
                    "Downloading supplementary files for %s\n" % gsm.name)
                paths = gsm.download_supplementary_files(
                    email=email,
                    download_sra=download_sra,
                    directory=dirpath,
                    sra_kwargs=sra_kwargs)
                downloaded_paths[gsm.name] = paths
        elif nproc > 1:
            # Parallelization enabled
            downloaders = list()
            # Collecting params for Pool.map in a loop
            for gsm in itervalues(self.gsms):
                downloaders.append([
                    gsm,
                    download_sra,
                    email,
                    dirpath,
                    sra_kwargs])
            p = Pool(nproc)
            results = p.map(_supplementary_files_download_worker, downloaders)
            downloaded_paths = dict(results)
        else:
            raise ValueError(
                "Nproc should be greater than zero: %s" % str(nproc))
        return downloaded_paths

    def download_SRA(self, email, directory='series', filterby=None, nproc=1,
                     **kwargs):
        """Download SRA files for each GSM in series.

        .. warning::

            Do not use parallel option (nproc > 1) in the interactive shell.
            For more details see `this issue
            <https://stackoverflow.com/questions/23641475/multiprocessing-working-in-python-but-not-in-ipython/23641560#23641560>`_
            on SO.

        Args:
            email (:obj:`str`): E-mail that will be provided to the Entrez.
            directory (:obj:`str`, optional): Directory to save the data
                (defaults to the 'series' which saves the data to the
                directory with the name of the series + '_SRA' ending).
                Defaults to "series".
            filterby (:obj:`callable`, optional): Filter GSM objects,
                argument is a function that operates on GSM object and
                returns bool eg. lambda x: "brain" not in x.name.
                Defaults to None.
            nproc (:obj:`int`, optional): Number of processes for SRA download
                (default is 1, no parallelization).
            **kwargs: Any arbitrary argument passed to GSM.download_SRA
                method. See the documentation for more details.

        Returns:
            :obj:`dict`: A dictionary containing output of ``GSM.download_SRA``
                method where each GSM accession ID is the key for the
                output.

        Raises:
            ValueError: nproc is lower than 1.
        """
        if directory == 'series':
            dirpath = os.path.abspath(self.get_accession() + "_SRA")
        else:
            dirpath = os.path.abspath(directory)
        utils.mkdir_p(dirpath)

        if filterby is not None:
            gsms_to_use = [gsm for gsm in self.gsms.values() if filterby(gsm)]
        else:
            gsms_to_use = self.gsms.values()

        if nproc == 1:
            # No need to parallelize, running ordinary download in loop
            downloaded_paths = dict()
            for gsm in gsms_to_use:
                logger.info(
                    "Downloading SRA files for %s series\n" % gsm.name)
                downloaded_paths[gsm.name] = gsm.download_SRA(
                    email=email,
                    directory=dirpath,
                    **kwargs)
        elif nproc > 1:
            # Parallelization enabled
            downloaders = list()
            # Collecting params for Pool.map in a loop
            for gsm in gsms_to_use:
                downloaders.append([
                    gsm,
                    email,
                    dirpath,
                    kwargs])
            p = Pool(nproc)
            results = p.map(_sra_download_worker, downloaders)
            downloaded_paths = dict(results)
        else:
            raise ValueError(
                "Nproc should be greater than zero: %s" % str(nproc))
        return downloaded_paths

    def _get_object_as_soft(self):
        """Get object as SOFT formatted string."""
        soft = []
        if self.database is not None:
            soft.append(self.database._get_object_as_soft())
        soft += ["^%s = %s" % (self.geotype, self.name),
                 self._get_metadata_as_string()]
        for gsm in itervalues(self.gsms):
            soft.append(gsm._get_object_as_soft())
        for gpl in itervalues(self.gpls):
            soft.append(gpl._get_object_as_soft())

        return "\n".join(soft)

    def __str__(self):
        return str("<%s: %s - %i SAMPLES, %i PLATFORM(s)>" % (
            self.geotype, self.name, len(self.gsms), len(self.gpls)))

    def __repr__(self):
        return str("<%s: %s - %i SAMPLES, %i PLATFORM(s)>" % (
            self.geotype, self.name, len(self.gsms), len(self.gpls)))