--- a/src/preprocess.py
+++ b/src/preprocess.py
@@ -0,0 +1,149 @@
+# Copyright 2017 Goekcen Eraslan
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import pickle, os, numbers
+
+import numpy as np
+import scipy as sp
+import pandas as pd
+import scanpy as sc
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import scale
+import scipy.sparse
+
+#TODO: Fix this
+class AnnSequence:
+    """Batch iterator over a count matrix and matching per-cell size factors."""
+
+    def __init__(self, matrix, batch_size, sf=None):
+        self.matrix = matrix
+        if sf is None:
+            self.size_factors = np.ones((self.matrix.shape[0], 1),
+                                        dtype=np.float32)
+        else:
+            self.size_factors = sf
+        self.batch_size = batch_size
+
+    def __len__(self):
+        return len(self.matrix) // self.batch_size
+
+    def __getitem__(self, idx):
+        batch = self.matrix[idx*self.batch_size:(idx+1)*self.batch_size]
+        batch_sf = self.size_factors[idx*self.batch_size:(idx+1)*self.batch_size]
+
+        # return an (X, Y) pair
+        return {'count': batch, 'size_factors': batch_sf}, batch
+
+
+def read_dataset(adata, transpose=False, test_split=False, copy=False):
+
+    if isinstance(adata, sc.AnnData):
+        if copy:
+            adata = adata.copy()
+    elif isinstance(adata, str):
+        adata = sc.read(adata)
+    else:
+        raise NotImplementedError('adata must be an AnnData object or a filename')
+
+    norm_error = 'Make sure that the dataset (adata.X) contains unnormalized count data.'
+    # sc.pp.normalize_per_cell adds an 'n_counts' column, so its presence means
+    # the matrix was already normalized
+    assert 'n_counts' not in adata.obs, norm_error
+
+    if adata.X.size < 50e6:  # check if adata.X is integer only if array is small
+        if sp.sparse.issparse(adata.X):
+            assert (adata.X.astype(int) != adata.X).nnz == 0, norm_error
+        else:
+            assert np.all(adata.X.astype(int) == adata.X), norm_error
+
+    if transpose: adata = adata.transpose()
+
+    if test_split:
+        train_idx, test_idx = train_test_split(np.arange(adata.n_obs), test_size=0.1, random_state=42)
+        spl = pd.Series(['train'] * adata.n_obs)
+        spl.iloc[test_idx] = 'test'
+        adata.obs['DCA_split'] = spl.values
+    else:
+        adata.obs['DCA_split'] = 'train'
+
+    adata.obs['DCA_split'] = adata.obs['DCA_split'].astype('category')
+    print('### Autoencoder: Successfully preprocessed {} genes and {} cells.'.format(adata.n_vars, adata.n_obs))
+
+    return adata
+
+def clr_normalize_each_cell(adata):
+    """Normalize count vector for each cell, i.e. for each row of .X"""
+
+    def seurat_clr(x):
+        # TODO: support sparseness
+        s = np.sum(np.log1p(x[x > 0]))
+        exp = np.exp(s / len(x))
+        return np.log1p(x / exp)
+
+    adata.raw = adata.copy()
+    sc.pp.normalize_per_cell(adata)
+    adata.obs['size_factors'] = adata.obs.n_counts / np.median(adata.obs.n_counts)
+
+    # apply to dense or sparse matrix, along axis; returns a dense matrix
+    adata.X = np.apply_along_axis(
+        seurat_clr, 1, (adata.raw.X.A if scipy.sparse.issparse(adata.raw.X) else adata.raw.X)
+    )
+    return adata
+
+def normalize(adata, filter_min_counts=True, size_factors=True, normalize_input=True, logtrans_input=True):
+
+    if filter_min_counts:
+        sc.pp.filter_genes(adata, min_counts=1)
+        sc.pp.filter_cells(adata, min_counts=1)
+
+    # keep the raw counts available in adata.raw before adata.X is transformed
+    if size_factors or normalize_input or logtrans_input:
+        adata.raw = adata.copy()
+    else:
+        adata.raw = adata
+
+    if size_factors:
+        sc.pp.normalize_per_cell(adata)
+        # per-cell size factors relative to the median library size
+        adata.obs['size_factors'] = adata.obs.n_counts / np.median(adata.obs.n_counts)
+    else:
+        adata.obs['size_factors'] = 1.0
+
+    if logtrans_input:
+        sc.pp.log1p(adata)
+
+    if normalize_input:
+        sc.pp.scale(adata)
+
+    return adata
+
+def read_genelist(filename):
+    with open(filename, 'rt') as f:
+        genelist = list(set(f.read().strip().split('\n')))
+    assert len(genelist) > 0, 'No genes detected in genelist file'
+    print('### Autoencoder: Subset of {} genes will be denoised.'.format(len(genelist)))
+
+    return genelist
+
+def write_text_matrix(matrix, filename, rownames=None, colnames=None, transpose=False):
+    if transpose:
+        matrix = matrix.T
+        rownames, colnames = colnames, rownames
+
+    pd.DataFrame(matrix, index=rownames, columns=colnames).to_csv(filename,
+                                                                  sep='\t',
+                                                                  index=(rownames is not None),
+                                                                  header=(colnames is not None),
+                                                                  float_format='%.6f')
+
+def read_pickle(inputfile):
+    with open(inputfile, 'rb') as f:
+        return pickle.load(f)
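
A minimal usage sketch of the helpers added above (not part of the diff; the
counts.h5ad input and normalized.tsv output names are placeholders, and it
assumes src/ is on the Python path so the module imports as `preprocess`):

    from preprocess import read_dataset, normalize, write_text_matrix

    # load raw counts and hold out 10% of cells in adata.obs['DCA_split']
    adata = read_dataset('counts.h5ad', test_split=True)

    # filter, compute per-cell size factors, log-transform and scale the input
    adata = normalize(adata, size_factors=True, normalize_input=True, logtrans_input=True)

    # write the processed matrix as a tab-separated text file with cell/gene labels
    write_text_matrix(adata.X, 'normalized.tsv',
                      rownames=adata.obs_names, colnames=adata.var_names)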