--- a +++ b/4-Multi-Omic Integration/scripts/4_metabolome2stitch.pl @@ -0,0 +1,76 @@ +#!bin/perl +## This script was used to extract metabolite-CIDm and CIDm-host target mapping information from STITCH database ## + +open (IN, "chemical.sources.v5.0.tsv"); #### stitch id mapping file #### +while (<IN>) { + chop; + next if (/^#/); + s/CID\S//g; + @a=split("\t",$_); + if (/PC\t(\d+)/) { + $pc{$1}="CIDm".$a[0]; + } + if (/PS\t(\d+)/) { + $ps{$1}="CIDm".$a[0]; + } + $merge{$a[1]}="CIDm".$a[0]; + if (/CHEBI:(\d+)/) { + $chebi{$1}="CIDm".$a[0]; + } + if (/KEGG/) { + $kegg{$a[3]}="CIDm".$a[0]; + } +} + +open (IN1, "metabolite_IDs.txt"); ### metabolite IDs list ### +open (OUT1, ">metabo2CIDm.txt"); ## output ## +while (<IN>) { + chop; + @a=split("\t",$_); + if ($chebi{$a[5]} ne $merge{$a[4]}) { + print STDERR "chebi and pubchem IDs for $a[0] do not match\n"; ### raise a flag if IDs do not match based on different mapping rules + } + else { + print OUT1 $a[0]."\t".$ps{$a[3]}."\t".$merge{$a[3]}."\t".$pc{$a[3]}."\t".$chebi{$a[4]}."\t".$kegg{$a[0]}."\t".$kegg{$a[5]}."\n"; + } +} + +open (IN, "9606.protein_chemical.links.detailed.v5.0.tsv"); +while (<IN>) { + chop; + s/9606\.//g; + @a=split("\t",$_); + next if ($a[6]<700); + #$hash{$a[0]}{$a[1]}=$a[5]; + $score{$a[0]}{$a[1]}=$a[2]."\t".$a[3]."\t".$a[4]."\t".$a[5]."\t".$a[6]; +} +open (IN, "9606.actions.v5.0.tsv"); +while (<IN>) { + chop; + s/9606\.//g; + @a=split("\t",$_); + next if ($a[5]<700); + $int{$a[0]}{$a[1]}=$a[2]; + $int{$a[1]}{$a[0]}=$a[2]; +} + +open (IN, "human_gene_ids.txt"); +while (<IN>) { + chop; + @a=split("\t",$_); + $geneid{$a[2]}=$a[9]."\t".$a[8]; +} + +open (IN2, "all_CIDm.txt"); #### stitch CIDm compound ID list ##### +open (OUT2, ">all_CIDm_targets.txt"); ## output ## +while (<IN>) { + chop; +# @a=split("\t",$_); + $id=$_; + if (exists $int{$id}) { + for my $key (keys %{$int{$id}}) { + print OUT2 $id."\t".$key."\t".$geneid{$key}."\t".$int{$id}{$key}."\t".$score{$id}{$key}."\n"; + } + } +} +