Source code for GEOparse.GEOTypes

"""
Classes that represent different GEO entities
"""

import os
import re
import abc
import gzip
import json
import time
import numpy as np
import platform
from pandas import DataFrame, concat

from multiprocessing import Pool

try:
    from urllib.error import HTTPError
except ImportError:
    from urllib2 import HTTPError
from six import iteritems, itervalues

from . import utils
from .sra_downloader import SRADownloader
from .logger import geoparse_logger as logger


def _sra_download_worker(*args):
    """A worker to download SRA files.

    To be used with multiprocessing.
    """
    # Unpack the single Pool.map argument: a [gsm, email, dirpath, kwargs]
    # list collected by GSE.download_SRA.
    gsm, email, dirpath, kwargs = args[0]
    return (gsm.get_accession(), gsm.download_SRA(email, dirpath, **kwargs))


def _supplementary_files_download_worker(*args):
    """A worker to download supplementary files.

    To be used with multiprocessing.
    """
    # Unpack the single Pool.map argument: a list collected by
    # GSE.download_supplementary_files.
    gsm, download_sra, email, dirpath, sra_kwargs = args[0]
    # Pass sra_kwargs through as the keyword argument of the same name;
    # GSM.download_supplementary_files has no **kwargs to unpack it into.
    return (gsm.get_accession(), gsm.download_supplementary_files(
        directory=dirpath,
        download_sra=download_sra,
        email=email,
        sra_kwargs=sra_kwargs))


class DataIncompatibilityException(Exception):
    pass
class NoMetadataException(Exception):
    pass
class BaseGEO(object):

    __metaclass__ = abc.ABCMeta

    geotype = None

    def __init__(self, name, metadata):
        """Initialize base GEO object.

        Args:
            name (:obj:`str`): Name of the object.
            metadata (:obj:`dict`): Metadata information.

        Raises:
            TypeError: Metadata should be a dict.
        """
        if not isinstance(metadata, dict):
            raise TypeError("Metadata should be a dictionary not a %s" % str(
                type(metadata)))
        self.name = name
        self.metadata = metadata
        self.relations = {}
        if 'relation' in self.metadata:
            for relation in self.metadata['relation']:
                tmp = re.split(r':\s+', relation)
                relname = tmp[0]
                relval = tmp[1]

                if relname in self.relations:
                    self.relations[relname].append(relval)
                else:
                    self.relations[relname] = [relval]
    def get_metadata_attribute(self, metaname):
        """Get the metadata attribute by the name.

        Args:
            metaname (:obj:`str`): Name of the attribute.

        Returns:
            :obj:`list` or :obj:`str`: Value(s) of the requested metadata
                attribute.

        Raises:
            NoMetadataException: Attribute error.
            TypeError: Metadata should be a list.
        """
        metadata_value = self.metadata.get(metaname, None)
        if metadata_value is None:
            raise NoMetadataException(
                "No metadata attribute named %s" % metaname)
        if not isinstance(metadata_value, list):
            raise TypeError("Metadata is not a list and it should be.")

        if len(metadata_value) > 1:
            return metadata_value
        else:
            return metadata_value[0]
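    # Illustrative usage sketch (not part of the module). It uses the GSM
    # subclass defined below; all names and values are hypothetical:
    #
    #     from pandas import DataFrame
    #     table = DataFrame({"ID_REF": ["a1"], "VALUE": [1.0]})
    #     columns = DataFrame({"description": ["probe id", "signal"]},
    #                         index=["ID_REF", "VALUE"])
    #     gsm = GSM(name="GSM0000", metadata={"title": ["control rep1"]},
    #               table=table, columns=columns)
    #     gsm.get_metadata_attribute("title")    # -> "control rep1"
    #     gsm.get_metadata_attribute("missing")  # raises NoMetadataException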
    def get_accession(self):
        """Return the GEO accession ID of the object.

        Returns:
            :obj:`str`: GEO accession ID
        """
        return self.get_metadata_attribute("geo_accession")
    def get_type(self):
        """Get the type of the GEO.

        Returns:
            :obj:`str`: Type attribute of the GEO
        """
        try:
            return self.get_metadata_attribute("type")
        except NoMetadataException:
            return None
    def _get_metadata_as_string(self):
        """Get the metadata as SOFT formatted string."""
        metalist = []
        for metaname, meta in iteritems(self.metadata):
            message = "Single value in metadata dictionary should be a list!"
            assert isinstance(meta, list), message
            for data in meta:
                if data:
                    metalist.append("!%s_%s = %s" % (
                        self.geotype.capitalize(), metaname, data))
        return "\n".join(metalist)
    def show_metadata(self):
        """Print metadata in SOFT format."""
        print(self._get_metadata_as_string())
    def to_soft(self, path_or_handle, as_gzip=False):
        """Save the object in a SOFT format.

        Args:
            path_or_handle (:obj:`str` or :obj:`file`): Path or handle to
                output file
            as_gzip (:obj:`bool`): Save as gzip
        """
        if isinstance(path_or_handle, str):
            if as_gzip:
                with gzip.open(path_or_handle, 'wt') as outfile:
                    outfile.write(self._get_object_as_soft())
            else:
                with open(path_or_handle, 'w') as outfile:
                    outfile.write(self._get_object_as_soft())
        else:
            path_or_handle.write(self._get_object_as_soft())
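    # Illustrative sketch (not part of the module): any GEO object can be
    # serialized to SOFT, plain or gzipped. "gsm" is assumed to be an
    # existing object:
    #
    #     gsm.to_soft("GSM0000.soft")
    #     gsm.to_soft("GSM0000.soft.gz", as_gzip=True)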
    @abc.abstractmethod
    def _get_object_as_soft(self):
        """Get the object as SOFT formatted string."""
        raise NotImplementedError("Method not implemented")

    def __str__(self):
        return str("<%s: %s>" % (self.geotype, self.name))

    def __repr__(self):
        return str("<%s: %s>" % (self.geotype, self.name))
class SimpleGEO(BaseGEO):

    __metaclass__ = abc.ABCMeta

    def __init__(self, name, metadata, table, columns):
        """Initialize simple GEO object.

        Args:
            name (:obj:`str`): Name of the object
            metadata (:obj:`dict`): Metadata information
            table (:obj:`pandas.DataFrame`): Table with the data from SOFT
                file
            columns (:obj:`pandas.DataFrame`): Description of the columns;
                the number, order and names of the columns (represented as
                the index of this DataFrame) have to be the same as in
                table.columns

        Raises:
            :obj:`ValueError`: Table should be a DataFrame
            :obj:`ValueError`: Columns' description should be a DataFrame
            :obj:`DataIncompatibilityException`: Columns are wrong
            :obj:`ValueError`: Description has to be present in columns
        """
        if not isinstance(table, DataFrame):
            raise ValueError(("Table data should be an instance of "
                              "pandas.DataFrame not %s") % str(type(table)))
        if not isinstance(columns, DataFrame):
            raise ValueError(("Columns description should be an instance of "
                              "pandas.DataFrame not %s") % str(type(columns)))
        BaseGEO.__init__(self, name=name, metadata=metadata)

        self.table = table
        self.columns = columns
        columns_are_correct = False
        if self.columns.index.tolist() != self.table.columns.tolist():
            if not self.columns.index.is_unique:
                # Try to correct a duplicated index the same way pandas
                # de-duplicates column names.
                logger.warning(
                    "Detected duplicated columns in %s %s. Correcting.\n" % (
                        self.geotype, self.name))
                indices = {}
                new_index = []
                for idx in self.columns.index:
                    if idx not in indices:
                        indices[idx] = 0
                        new_index.append(idx)
                    else:
                        indices[idx] += 1
                        new_index.append("%s.%i" % (idx, indices[idx]))
                self.columns.index = new_index
                if self.columns.index.tolist() == self.table.columns.tolist():
                    columns_are_correct = True
            if not columns_are_correct:
                # If the columns are still not correct, check the order.
                if sorted(self.columns.index.tolist()) == sorted(
                        self.table.columns.tolist()):
                    logger.warning(
                        "Data columns in %s %s are not in order. "
                        "Reordering.\n" % (self.geotype, self.name))
                    self.columns = self.columns.loc[self.table.columns]
                    if self.columns.index.tolist() == \
                            self.table.columns.tolist():
                        columns_are_correct = True
        else:
            columns_are_correct = True

        if not columns_are_correct:
            rows_in_columns = ", ".join(self.columns.index.tolist())
            columns_in_table = ", ".join(self.table.columns.tolist())
            raise DataIncompatibilityException(
                "\nData columns do not match columns description index in "
                "%s\n" % self.name +
                "Columns in table are: %s\n" % columns_in_table +
                "Index in columns are: %s\n" % rows_in_columns)
        if self.columns.columns[0] != 'description':
            raise ValueError(("Columns table must contain a column named "
                              "'description'. Here columns are: %s") % (
                ", ".join(map(str, self.columns.columns))))
    def head(self):
        """Return a short description of the object as a string."""
        summary = list()
        summary.append("%s %s" % (self.geotype, self.name) + "\n")
        summary.append(" - Metadata:" + "\n")
        summary.append(
            "\n".join(self._get_metadata_as_string().split("\n")[:5]) + "\n")
        summary.append("\n")
        summary.append(" - Columns:" + "\n")
        summary.append(self.columns.to_string() + "\n")
        summary.append("\n")
        summary.append(" - Table:" + "\n")
        summary.append(
            "\t".join(["Index"] + self.table.columns.tolist()) + "\n")
        summary.append(self.table.head().to_string(header=None) + "\n")
        summary.append(" " * 40 + "..." + " " * 40 + "\n")
        summary.append(" " * 40 + "..." + " " * 40 + "\n")
        summary.append(" " * 40 + "..." + " " * 40 + "\n")
        summary.append(self.table.tail().to_string(header=None) + "\n")
        return "\n".join([str(s) for s in summary])
    def show_columns(self):
        """Print the columns description DataFrame."""
        print(self.columns)
    def show_table(self, number_of_lines=5):
        """Show the first lines of the table as a pandas.DataFrame.

        Args:
            number_of_lines (:obj:`int`): Number of lines to show. Defaults
                to 5.
        """
        print(self.table.head(number_of_lines))
    def _get_object_as_soft(self):
        """Get the object as SOFT formatted string."""
        soft = ["^%s = %s" % (self.geotype, self.name),
                self._get_metadata_as_string(),
                self._get_columns_as_string(),
                self._get_table_as_string()]
        return "\n".join(soft)

    def _get_table_as_string(self):
        """Get the table as SOFT formatted string."""
        tablelist = []
        tablelist.append("!%s_table_begin" % self.geotype.lower())
        tablelist.append("\t".join(self.table.columns))
        for idx, row in self.table.iterrows():
            tablelist.append("\t".join(map(str, row)))
        tablelist.append("!%s_table_end" % self.geotype.lower())
        return "\n".join(tablelist)

    def _get_columns_as_string(self):
        """Return the columns as SOFT formatted string."""
        columnslist = []
        for rowidx, row in self.columns.iterrows():
            columnslist.append("#%s = %s" % (rowidx, row.description))
        return "\n".join(columnslist)
class GSM(SimpleGEO):

    """Class that represents a sample from GEO database."""

    geotype = 'SAMPLE'
    def annotate(self, gpl, annotation_column, gpl_on="ID", gsm_on="ID_REF",
                 in_place=False):
        """Annotate GSM with provided GPL.

        Args:
            gpl (:obj:`pandas.DataFrame`): A Platform or DataFrame to
                annotate with
            annotation_column (:obj:`str`): Column in a table for annotation
            gpl_on (:obj:`str`): Use this column in GPL to merge. Defaults
                to "ID".
            gsm_on (:obj:`str`): Use this column in GSM to merge. Defaults
                to "ID_REF".
            in_place (:obj:`bool`): Substitute table in GSM by new annotated
                table. Defaults to False.

        Returns:
            :obj:`pandas.DataFrame` or :obj:`None`: Annotated table or None

        Raises:
            TypeError: GPL should be GPL or pandas.DataFrame
        """
        if isinstance(gpl, GPL):
            annotation_table = gpl.table
        elif isinstance(gpl, DataFrame):
            annotation_table = gpl
        else:
            raise TypeError("gpl should be a GPL object or a pandas.DataFrame")

        # Annotate by merging the GSM table with the annotation column.
        annotated = self.table.merge(
            annotation_table[[gpl_on, annotation_column]], left_on=gsm_on,
            right_on=gpl_on)
        del annotated[gpl_on]
        if in_place:
            self.table = annotated
            return None
        else:
            return annotated
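    # Illustrative sketch (not part of the module). "GB_ACC" is a column
    # found on many platform tables but is an assumption here:
    #
    #     annotated = gsm.annotate(gpl, annotation_column="GB_ACC")
    #     gsm.annotate(gpl, annotation_column="GB_ACC", in_place=True)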
    def annotate_and_average(self, gpl, expression_column, group_by_column,
                             rename=True, force=False, merge_on_column=None,
                             gsm_on=None, gpl_on=None):
        """Annotate GSM table with provided GPL.

        Args:
            gpl (:obj:`GEOTypes.GPL`): Platform for annotations
            expression_column (:obj:`str`): Column name in which
                "expressions" are represented
            group_by_column (:obj:`str`): The data will be grouped and
                averaged over this column and only this column will be kept
            rename (:obj:`bool`): Rename output column to the self.name.
                Defaults to True.
            force (:obj:`bool`): If the name of the GPL does not match the
                platform name in GSM proceed anyway. Defaults to False.
            merge_on_column (:obj:`str`): Column to merge the data on.
                Defaults to None.
            gsm_on (:obj:`str`): In the case columns to merge are different
                in GSM and GPL use this column in GSM. Defaults to None.
            gpl_on (:obj:`str`): In the case columns to merge are different
                in GSM and GPL use this column in GPL. Defaults to None.

        Returns:
            :obj:`pandas.DataFrame`: Annotated data
        """
        if gpl.name != self.metadata['platform_id'][0] and not force:
            raise KeyError("Platforms from GSM (%s) and from GPL (%s)" % (
                gpl.name, self.metadata['platform_id']) +
                " are incompatible. Use force=True to use this GPL.")
        if merge_on_column is None and gpl_on is None and gsm_on is None:
            raise Exception("You have to provide one of the two: "
                            "merge_on_column or gpl_on and gsm_on parameters")
        if merge_on_column:
            logger.info("merge_on_column is not None. Using this option.")
            tmp_data = self.table.merge(gpl.table, on=merge_on_column,
                                        how='outer')
            tmp_data = tmp_data.groupby(group_by_column).mean()[
                [expression_column]]
        else:
            if gpl_on is None or gsm_on is None:
                raise Exception("Please provide both gpl_on and gsm_on or "
                                "provide merge_on_column only")
            tmp_data = self.table.merge(gpl.table, left_on=gsm_on,
                                        right_on=gpl_on, how='outer')
            tmp_data = tmp_data.groupby(group_by_column).mean()[
                [expression_column]]
        if rename:
            tmp_data.columns = [self.name]
        return tmp_data
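    # Illustrative sketch (not part of the module); the column names are
    # platform-dependent assumptions:
    #
    #     averaged = gsm.annotate_and_average(
    #         gpl=gpl,
    #         expression_column="VALUE",
    #         group_by_column="GB_ACC",
    #         gsm_on="ID_REF",
    #         gpl_on="ID")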
    def download_supplementary_files(self, directory="./", download_sra=True,
                                     email=None, sra_kwargs=None):
        """Download all supplementary data available for the sample.

        Args:
            directory (:obj:`str`): Directory to download the data (in this
                directory the function will create a new directory with the
                files). Defaults to "./".
            download_sra (:obj:`bool`): Indicates whether to download SRA raw
                data too. Defaults to True.
            email (:obj:`str`): E-mail that will be provided to the Entrez.
                It is mandatory if download_sra=True. Defaults to None.
            sra_kwargs (:obj:`dict`, optional): Kwargs passed to the
                download_SRA method. Defaults to None.

        Returns:
            :obj:`dict`: A key-value pair of name taken from the metadata and
                paths downloaded, in the case of SRA files the key is ``SRA``.
        """
        directory_path = os.path.abspath(
            os.path.join(directory, "%s_%s_%s" % (
                'Supp',
                self.get_accession(),
                # the directory name cannot contain many of the signs
                re.sub(r'[\s\*\?\(\),\.;]', '_', self.metadata['title'][0]))))
        utils.mkdir_p(os.path.abspath(directory_path))

        downloaded_paths = dict()
        if sra_kwargs is None:
            sra_kwargs = {}

        # Possible erroneous values that should be skipped outright.
        blacklist = ('NONE',)
        for metakey, metavalue in iteritems(self.metadata):
            if 'supplementary_file' in metakey:
                assert len(metavalue) == 1 and metavalue[0] != ''
                if metavalue[0] in blacklist:
                    logger.warning("%s value is blacklisted as '%s' - "
                                   "skipping" % (metakey, metavalue[0]))
                    continue
                # SRA files are downloaded separately below.
                if 'sra' not in metavalue[0]:
                    download_path = os.path.abspath(os.path.join(
                        directory,
                        os.path.join(directory_path,
                                     metavalue[0].split("/")[-1])))
                    try:
                        utils.download_from_url(metavalue[0], download_path)
                        downloaded_paths[metavalue[0]] = download_path
                    except Exception as err:
                        logger.error(
                            "Cannot download %s supplementary file (%s)" % (
                                self.get_accession(), err))
        if download_sra:
            try:
                downloaded_files = self.download_SRA(
                    email, directory=directory, **sra_kwargs)
                downloaded_paths.update(downloaded_files)
            except Exception as err:
                logger.error("Cannot download %s SRA file (%s)" % (
                    self.get_accession(), err))
        return downloaded_paths
    def download_SRA(self, email, directory='./', **kwargs):
        """Download RAW data as SRA file.

        The files will be downloaded to the sample directory created ad hoc
        or to the directory specified by the parameter. The sample has to
        come from sequencing, e.g. mRNA-seq, CLIP etc.

        An important parameter is the filetype. By default an SRA file is
        accessed by FTP and such a file is downloaded. This does not require
        additional libraries. However, in order to produce FASTA or FASTQ
        files one needs the SRA-Toolkit, so it is assumed that this toolkit
        is already installed or will be installed in the near future. One can
        immediately specify the download type as fasta or fastq.

        To see all possible ``**kwargs`` that could be passed to the function
        see the description of
        :class:`~GEOparse.sra_downloader.SRADownloader`.

        Args:
            email (:obj:`str`): an email (any) - Required by NCBI for access
            directory (:obj:`str`, optional): The directory to which download
                the data. Defaults to "./".
            **kwargs: Arbitrary keyword arguments, see description

        Returns:
            :obj:`dict`: A dictionary containing only one key (``SRA``) with
                the list of downloaded files.

        Raises:
            :obj:`TypeError`: Type to download unknown
            :obj:`NoSRARelationException`: No SRAToolkit
            :obj:`Exception`: Wrong e-mail
            :obj:`HTTPError`: Cannot access or connect to DB
        """
        downloader = SRADownloader(self, email, directory, **kwargs)
        return {"SRA": downloader.download()}
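    # Illustrative sketch (not part of the module); the e-mail is an example
    # value and "filetype" is assumed to be one of the SRADownloader options:
    #
    #     paths = gsm.download_SRA("user@example.com", directory="./sra",
    #                              filetype="fastq")
    #     paths["SRA"]  # -> list of downloaded files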
class GPL(SimpleGEO):

    """Class that represents platform from GEO database."""

    geotype = "PLATFORM"

    def __init__(self, name, metadata, table=None, columns=None, gses=None,
                 gsms=None, database=None):
        """Initialize GPL.

        Args:
            name (:obj:`str`): Name of the object
            metadata (:obj:`dict`): Metadata information
            table (:obj:`pandas.DataFrame`, optional): Table with actual GPL
                data
            columns (:obj:`pandas.DataFrame`, optional): Table with
                description of the columns. Defaults to None.
            gses (:obj:`dict` of :obj:`GEOparse.GSE`, optional): A dictionary
                of GSE objects. Defaults to None.
            gsms (:obj:`dict` of :obj:`GEOparse.GSM`, optional): A dictionary
                of GSM objects. Defaults to None.
            database (:obj:`GEOparse.GEODatabase`, optional): A database
                object from SOFT file associated with GPL. Defaults to None.
        """
        gses = {} if gses is None else gses
        if not isinstance(gses, dict):
            raise ValueError(
                "GSEs should be a dictionary not a %s" % str(type(gses)))
        gsms = {} if gsms is None else gsms
        if not isinstance(gsms, dict):
            raise ValueError(
                "GSMs should be a dictionary not a %s" % str(type(gsms)))

        for gsm_name, gsm in iteritems(gsms):
            assert isinstance(gsm, GSM), "All GSMs should be of type GSM"
        for gse_name, gse in iteritems(gses):
            assert isinstance(gse, GSE), "All GSEs should be of type GSE"

        if database is not None:
            if not isinstance(database, GEODatabase):
                raise ValueError(
                    "Database should be a GEODatabase not a %s" % str(
                        type(database)))

        table = DataFrame() if table is None else table
        columns = DataFrame() if columns is None else columns
        SimpleGEO.__init__(self, name=name, metadata=metadata, table=table,
                           columns=columns)
        self.gses = gses
        self.gsms = gsms
        self.database = database
class GDSSubset(BaseGEO):

    """Class that represents a subset from GEO GDS object."""

    geotype = "SUBSET"

    def _get_object_as_soft(self):
        """Get the object as SOFT formatted string."""
        soft = ["^%s = %s" % (self.geotype, self.name),
                self._get_metadata_as_string()]
        return "\n".join(soft)
class GEODatabase(BaseGEO):

    """Class that represents the database section of a GEO SOFT file."""

    geotype = "DATABASE"

    def _get_object_as_soft(self):
        """Return object as SOFT formatted string."""
        soft = ["^%s = %s" % (self.geotype, self.name),
                self._get_metadata_as_string()]
        return "\n".join(soft)
class GDS(SimpleGEO):

    """Class that represents a dataset from GEO database."""

    geotype = "DATASET"

    def __init__(self, name, metadata, table, columns, subsets,
                 database=None):
        """Initialize GDS.

        Args:
            name (:obj:`str`): Name of the object.
            metadata (:obj:`dict`): Metadata information.
            table (:obj:`pandas.DataFrame`): Table with the data from SOFT
                file.
            columns (:obj:`pandas.DataFrame`): Description of the columns;
                the number, order and names of the columns (represented as
                the index of this DataFrame) have to be the same as in
                table.columns.
            subsets (:obj:`dict` of :obj:`GEOparse.GDSSubset`): GDSSubset from
                GDS soft file.
            database (:obj:`GEOparse.Database`, optional): Database from SOFT
                file. Defaults to None.
        """
        if not isinstance(subsets, dict):
            raise ValueError(
                "Subsets should be a dictionary not a %s" % str(
                    type(subsets)))
        if database is not None:
            if not isinstance(database, GEODatabase):
                raise ValueError(
                    "Database should be a GEODatabase not a %s" % str(
                        type(database)))

        SimpleGEO.__init__(self, name=name, metadata=metadata, table=table,
                           columns=columns)
        # Effectively deletes the columns with ID_REF.
        self.columns = self.columns.dropna()
        self.subsets = subsets
        self.database = database

        for subset_name, subset in iteritems(subsets):
            message = "All subsets should be of type GDSSubset"
            assert isinstance(subset, GDSSubset), message

    def _get_object_as_soft(self):
        """Return object as SOFT formatted string."""
        soft = []
        if self.database is not None:
            soft.append(self.database._get_object_as_soft())
        soft += ["^%s = %s" % (self.geotype, self.name),
                 self._get_metadata_as_string()]
        for subset in self.subsets.values():
            soft.append(subset._get_object_as_soft())
        soft += ["^%s = %s" % (self.geotype, self.name),
                 self._get_columns_as_string(),
                 self._get_table_as_string()]
        return "\n".join(soft)
class GSE(BaseGEO):

    """Class representing GEO series."""

    geotype = "SERIES"

    def __init__(self, name, metadata, gpls=None, gsms=None, database=None):
        """Initialize GSE.

        Args:
            name (:obj:`str`): Name of the object.
            metadata (:obj:`dict`): Metadata information.
            gpls (:obj:`dict` of :obj:`GEOparse.GPL`, optional): A dictionary
                of GPL objects. Defaults to None.
            gsms (:obj:`dict` of :obj:`GEOparse.GSM`, optional): A dictionary
                of GSM objects. Defaults to None.
            database (:obj:`GEOparse.Database`, optional): Database from SOFT
                file. Defaults to None.
        """
        gpls = {} if gpls is None else gpls
        if not isinstance(gpls, dict):
            raise ValueError(
                "GPLs should be a dictionary not a %s" % str(type(gpls)))
        gsms = {} if gsms is None else gsms
        if not isinstance(gsms, dict):
            raise ValueError(
                "GSMs should be a dictionary not a %s" % str(type(gsms)))

        for gsm_name, gsm in iteritems(gsms):
            assert isinstance(gsm, GSM), "All GSMs should be of type GSM"
        for gpl_name, gpl in iteritems(gpls):
            assert isinstance(gpl, GPL), "All GPLs should be of type GPL"

        if database is not None:
            if not isinstance(database, GEODatabase):
                raise ValueError(
                    "Database should be a GEODatabase not a %s" % str(
                        type(database)))

        BaseGEO.__init__(self, name=name, metadata=metadata)
        self.gpls = gpls
        self.gsms = gsms
        self.database = database
        self._phenotype_data = None

    @property
    def phenotype_data(self):
        """Get the phenotype data for each of the samples."""
        if self._phenotype_data is None:
            pheno_data = {}
            for gsm_name, gsm in iteritems(self.gsms):
                tmp = {}
                for key, value in iteritems(gsm.metadata):
                    if len(value) == 0:
                        tmp[key] = np.nan
                    elif key.startswith("characteristics_"):
                        for i, char in enumerate(value):
                            char = re.split(r":\s+", char)
                            char_type, char_value = [char[0],
                                                     ": ".join(char[1:])]
                            tmp[key + "." + str(i) + "." +
                                char_type] = char_value
                    else:
                        tmp[key] = ",".join(value)
                pheno_data[gsm_name] = tmp
            self._phenotype_data = DataFrame(pheno_data).T
        return self._phenotype_data
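    # Illustrative sketch (not part of the module). GSE1563 is used only as
    # an example accession:
    #
    #     import GEOparse
    #     gse = GEOparse.get_GEO(geo="GSE1563", destdir="./")
    #     gse.phenotype_data.head()  # per-sample metadata as a DataFrame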
    def merge_and_average(self, platform, expression_column, group_by_column,
                          force=False, merge_on_column=None, gsm_on=None,
                          gpl_on=None):
        """Merge and average GSE samples.

        For the given platform prepare the DataFrame with all the samples
        present in the GSE annotated with given column from the platform and
        averaged over that column.

        Args:
            platform (:obj:`str` or :obj:`GEOparse.GPL`): GPL platform to
                use.
            expression_column (:obj:`str`): Column name in which
                "expressions" are represented
            group_by_column (:obj:`str`): The data will be grouped and
                averaged over this column and only this column will be kept
            force (:obj:`bool`): If the name of the GPL does not match the
                platform name in GSM proceed anyway
            merge_on_column (:obj:`str`): Column to merge the data on -
                should be present in both GSM and GPL
            gsm_on (:obj:`str`): In the case columns to merge are different
                in GSM and GPL use this column in GSM
            gpl_on (:obj:`str`): In the case columns to merge are different
                in GSM and GPL use this column in GPL

        Returns:
            :obj:`pandas.DataFrame`: Merged and averaged table of results.
        """
        if isinstance(platform, str):
            gpl = self.gpls[platform]
        elif isinstance(platform, GPL):
            gpl = platform
        else:
            raise ValueError("Platform has to be of type GPL or string with "
                             "key for platform in GSE")

        data = []
        for gsm in self.gsms.values():
            if gpl.name == gsm.metadata['platform_id'][0]:
                data.append(gsm.annotate_and_average(
                    gpl=gpl,
                    merge_on_column=merge_on_column,
                    expression_column=expression_column,
                    group_by_column=group_by_column,
                    force=force,
                    gpl_on=gpl_on,
                    gsm_on=gsm_on))
        if len(data) == 0:
            logger.warning("No samples for the platform were found\n")
            return None
        elif len(data) == 1:
            return data[0]
        else:
            return data[0].join(data[1:])
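    # Illustrative sketch (not part of the module); the platform accession
    # and column names are assumptions:
    #
    #     merged = gse.merge_and_average(
    #         platform="GPL8300",
    #         expression_column="VALUE",
    #         group_by_column="GB_ACC",
    #         gsm_on="ID_REF",
    #         gpl_on="ID")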
    def pivot_samples(self, values, index="ID_REF"):
        """Pivot samples by specified column.

        Construct a table in which columns (names) are the samples, the
        index is a specified column, e.g. ID_REF, and the values in the
        columns are of one specified type.

        Args:
            values (:obj:`str`): Column name present in all GSMs.
            index (:obj:`str`, optional): Column name that will become an
                index in the pivoted table. Defaults to "ID_REF".

        Returns:
            :obj:`pandas.DataFrame`: Pivoted data
        """
        data = []
        for gsm in self.gsms.values():
            tmp_data = gsm.table.copy()
            tmp_data["name"] = gsm.name
            data.append(tmp_data)
        ndf = concat(data).pivot(index=index, values=values, columns="name")
        return ndf
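    # Illustrative sketch (not part of the module): one column per sample,
    # probe IDs as the index. "VALUE" is the conventional GEO column name,
    # assumed to be present in all samples:
    #
    #     matrix = gse.pivot_samples("VALUE")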
    def pivot_and_annotate(self, values, gpl, annotation_column, gpl_on="ID",
                           gsm_on="ID_REF"):
        """Pivot the samples and annotate them with provided GPL.

        Args:
            values (:obj:`str`): Column to use as values, e.g. "VALUES"
            gpl (:obj:`pandas.DataFrame` or :obj:`GEOparse.GPL`): A Platform
                or DataFrame to annotate with.
            annotation_column (:obj:`str`): Column in table for annotation.
            gpl_on (:obj:`str`, optional): Use this column in GPL to merge.
                Defaults to "ID".
            gsm_on (:obj:`str`, optional): Use this column in GSM to merge.
                Defaults to "ID_REF".

        Returns:
            pandas.DataFrame: Pivoted and annotated table of results
        """
        if isinstance(gpl, GPL):
            annotation_table = gpl.table
        elif isinstance(gpl, DataFrame):
            annotation_table = gpl
        else:
            raise TypeError("gpl should be a GPL object or a pandas.DataFrame")
        pivoted_samples = self.pivot_samples(values=values, index=gsm_on)
        ndf = pivoted_samples.reset_index().merge(
            annotation_table[[gpl_on, annotation_column]], left_on=gsm_on,
            right_on=gpl_on).set_index(gsm_on)
        del ndf[gpl_on]
        ndf.columns.name = 'name'
        return ndf
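    # Illustrative sketch (not part of the module); "GB_ACC" is assumed to
    # be present in the platform table:
    #
    #     gpl = list(gse.gpls.values())[0]
    #     annotated = gse.pivot_and_annotate(
    #         values="VALUE", gpl=gpl, annotation_column="GB_ACC")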
    def download_supplementary_files(self, directory='series',
                                     download_sra=True, email=None,
                                     sra_kwargs=None, nproc=1):
        """Download supplementary data.

        .. warning::

            Do not use the parallel option (nproc > 1) in the interactive
            shell. For more details see `this issue
            <https://stackoverflow.com/questions/23641475/multiprocessing-working-in-python-but-not-in-ipython/23641560#23641560>`_
            on SO.

        Args:
            directory (:obj:`str`, optional): Directory to download the data
                (in this directory the function will create a new directory
                with the files); by default it will be named with the series
                name + "_Supp".
            download_sra (:obj:`bool`, optional): Indicates whether to
                download SRA raw data too. Defaults to True.
            email (:obj:`str`, optional): E-mail that will be provided to the
                Entrez. Defaults to None.
            sra_kwargs (:obj:`dict`, optional): Kwargs passed to the
                GSM.download_SRA method. Defaults to None.
            nproc (:obj:`int`, optional): Number of processes for SRA
                download (default is 1, no parallelization).

        Returns:
            :obj:`dict`: Downloaded data for each of the GSM
        """
        if sra_kwargs is None:
            sra_kwargs = dict()
        if directory == 'series':
            dirpath = os.path.abspath(self.get_accession() + "_Supp")
            utils.mkdir_p(dirpath)
        else:
            dirpath = os.path.abspath(directory)
            utils.mkdir_p(dirpath)
        downloaded_paths = dict()
        if nproc == 1:
            # No need to parallelize, running ordinary download in a loop.
            downloaded_paths = dict()
            for gsm in itervalues(self.gsms):
                logger.info(
                    "Downloading supplementary files for %s\n" % gsm.name)
                paths = gsm.download_supplementary_files(
                    email=email,
                    download_sra=download_sra,
                    directory=dirpath,
                    sra_kwargs=sra_kwargs)
                downloaded_paths[gsm.name] = paths
        elif nproc > 1:
            # Parallelization enabled.
            downloaders = list()
            # Collecting params for Pool.map in a loop.
            for gsm in itervalues(self.gsms):
                downloaders.append([
                    gsm,
                    download_sra,
                    email,
                    dirpath,
                    sra_kwargs])
            p = Pool(nproc)
            results = p.map(_supplementary_files_download_worker, downloaders)
            downloaded_paths = dict(results)
        else:
            raise ValueError("Nproc should be a positive integer: %s" % str(
                nproc))
        return downloaded_paths
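    # Illustrative sketch (not part of the module); the target directory is
    # an example value:
    #
    #     supp = gse.download_supplementary_files(
    #         directory="./supp", download_sra=False)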
    def download_SRA(self, email, directory='series', filterby=None, nproc=1,
                     **kwargs):
        """Download SRA files for each GSM in series.

        .. warning::

            Do not use the parallel option (nproc > 1) in the interactive
            shell. For more details see `this issue
            <https://stackoverflow.com/questions/23641475/multiprocessing-working-in-python-but-not-in-ipython/23641560#23641560>`_
            on SO.

        Args:
            email (:obj:`str`): E-mail that will be provided to the Entrez.
            directory (:obj:`str`, optional): Directory to save the data
                (defaults to 'series', which saves the data to a directory
                named after the series with the '_SRA' ending). Defaults to
                "series".
            filterby (:obj:`str`, optional): Filter GSM objects; the argument
                is a function that operates on a GSM object and returns bool,
                e.g. lambda x: "brain" not in x.name. Defaults to None.
            nproc (:obj:`int`, optional): Number of processes for SRA
                download (default is 1, no parallelization).
            **kwargs: Any arbitrary argument passed to GSM.download_SRA
                method. See the documentation for more details.

        Returns:
            :obj:`dict`: A dictionary containing output of
                ``GSM.download_SRA`` method where each GSM accession ID is
                the key for the output.
        """
        if directory == 'series':
            dirpath = os.path.abspath(self.get_accession() + "_SRA")
            utils.mkdir_p(dirpath)
        else:
            dirpath = os.path.abspath(directory)
            utils.mkdir_p(dirpath)

        if filterby is not None:
            gsms_to_use = [gsm for gsm in self.gsms.values() if filterby(gsm)]
        else:
            gsms_to_use = self.gsms.values()

        if nproc == 1:
            # No need to parallelize, running ordinary download in a loop.
            downloaded_paths = dict()
            for gsm in gsms_to_use:
                logger.info("Downloading SRA files for %s\n" % gsm.name)
                downloaded_paths[gsm.name] = gsm.download_SRA(
                    email=email,
                    directory=dirpath,
                    **kwargs)
        elif nproc > 1:
            # Parallelization enabled.
            downloaders = list()
            # Collecting params for Pool.map in a loop.
            for gsm in gsms_to_use:
                downloaders.append([
                    gsm,
                    email,
                    dirpath,
                    kwargs])
            p = Pool(nproc)
            results = p.map(_sra_download_worker, downloaders)
            downloaded_paths = dict(results)
        else:
            raise ValueError("Nproc should be a positive integer: %s" % str(
                nproc))
        return downloaded_paths
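    # Illustrative sketch (not part of the module); the filter is a
    # hypothetical criterion on the sample title:
    #
    #     paths = gse.download_SRA(
    #         email="user@example.com",
    #         filterby=lambda gsm: "brain" in gsm.metadata["title"][0],
    #         nproc=1)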
    def _get_object_as_soft(self):
        """Get object as SOFT formatted string."""
        soft = []
        if self.database is not None:
            soft.append(self.database._get_object_as_soft())
        soft += ["^%s = %s" % (self.geotype, self.name),
                 self._get_metadata_as_string()]
        for gsm in itervalues(self.gsms):
            soft.append(gsm._get_object_as_soft())
        for gpl in itervalues(self.gpls):
            soft.append(gpl._get_object_as_soft())
        return "\n".join(soft)

    def __str__(self):
        return str("<%s: %s - %i SAMPLES, %i PLATFORM(s)>" % (
            self.geotype, self.name, len(self.gsms), len(self.gpls)))

    def __repr__(self):
        return str("<%s: %s - %i SAMPLES, %i PLATFORM(s)>" % (
            self.geotype, self.name, len(self.gsms), len(self.gpls)))