Skip to content

Commit e2b3183

Browse files
committed
Merge pull request #142 in SAT/pbmm2 from bugfix/TAG-4740-unnamed-samples to develop
* commit 'e31bc4fd7522c6af2febd9a60f9c5b2e94379275': Label read groups without SM tags, in presence of multiple BioSamples, as 'UnnamedSample'
2 parents dc9cf50 + e31bc4f commit e2b3183

File tree

4 files changed

+161
-16
lines changed

4 files changed

+161
-16
lines changed

src/SampleNames.cpp

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -80,20 +80,15 @@ MovieToSampleToInfix SampleNames::DetermineMovieToSampleToInfix(const UserIO& ui
8080

8181
const auto& md = ds.Metadata();
8282
const auto& biosamples = md.BioSamples();
83+
const auto biosampleCount = biosamples.Size();
8384
std::string nameFromMetadata;
84-
if (biosamples.Size() > 0) {
85-
if (namedSampleCount == 0) {
86-
throw AbortException(
87-
"<BioSamples> list element is present in dataset XML, but SM tags are "
88-
"missing from BAM header read groups");
89-
}
90-
if (biosamples.Size() > 1) {
91-
PBLOG_WARN << "Found more than 1 biosample, which is not yet supported. Will pick "
92-
"the first!";
93-
}
94-
for (const auto& biosample : biosamples) {
95-
nameFromMetadata = biosample.Name();
96-
break;
85+
if (biosampleCount > 0) {
86+
if (biosampleCount > 1 && namedSampleCount == 0) {
87+
PBLOG_INFO << "Found more than 1 biosample, but read groups lack the SM tag - "
88+
"using 'UnnamedSample'!";
89+
nameFromMetadata = "UnnamedSample";
90+
} else {
91+
nameFromMetadata = biosamples[0].Name();
9792
}
9893
}
9994

tests/cram/biosampleConsensus.t

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,3 @@
4848
*\tSM:testSample\t* (glob)
4949
*\tSM:testSample\t* (glob)
5050

51-
$ $__PBTEST_PBMM2_EXE align $NO_SM_BIOSAMPLES $REF $CRAMTMP/ccs8.bam
52-
*<BioSamples> list element is present in dataset XML, but SM tags are missing* (glob)
53-
[1]
tests/cram/data/no_sm_biosamples.subreadset.xml

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
<pbds:SubreadSet CreatedAt="2018-09-07T15:36:59.919Z" MetaType="PacBio.DataSet.SubreadSet" Name="HG2_SBv2_ELF15kbA_4pM-Cell2" Tags="subreadset" TimeStampedName="54238-SubreadSetCollection-2018-54-07T16:54:12.346Z" UniqueId="0d045cc0-9929-4975-ac64-84922b1bd17a" Version="4.0.1" xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:pbdm="http://pacificbiosciences.com/PacBioDataModel.xsd" xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd" xmlns:pbpn="http://pacificbiosciences.com/PacBioPartNumbers.xsd" xmlns:pbrk="http://pacificbiosciences.com/PacBioReagentKit.xsd" xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioSecondaryDataModel.xsd">
2+
<pbbase:ExternalResources>
3+
<pbbase:ExternalResource Description="Points to the subreads bam file." MetaType="PacBio.SubreadFile.SubreadBamFile" Name="subreads bam" ResourceId="median.bam" TimeStampedName="pacbio_subreadfile_subreadbamfile-180910_083901854" UniqueId="7efab2f0-7bfd-4c5c-9376-c1e7eb6d65e8" Version="4.0.1">
4+
<pbbase:FileIndices>
5+
<pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="median.bam.pbi" TimeStampedName="pacbio_index_pacbioindex-180910_083901854" UniqueId="c6b195fd-84e7-45c3-87bc-7cebe4072881" Version="4.0.1" />
6+
</pbbase:FileIndices>
7+
<pbbase:ExternalResources>
8+
<pbbase:ExternalResource Description="Points to the scraps bam file." MetaType="PacBio.SubreadFile.ScrapsBamFile" Name="scraps bam" ResourceId="median.scraps.bam" TimeStampedName="pacbio_subreadfile_scrapsbamfile-180910_083901865" UniqueId="cc138531-0569-4531-a78c-58020272e2d7" Version="4.0.1">
9+
<pbbase:FileIndices>
10+
<pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="median.scraps.bam.pbi" TimeStampedName="pacbio_index_pacbioindex-180910_083901865" UniqueId="45e33dc1-6ef5-4e9b-b94f-60b43e1f39bd" Version="4.0.1" />
11+
</pbbase:FileIndices>
12+
</pbbase:ExternalResource>
13+
<pbbase:ExternalResource Description="Points to the adapters.fasta file." MetaType="PacBio.SubreadFile.AdapterFastaFile" Name="Adapters FASTA" ResourceId="median.adapters.fasta" TimeStampedName="pacbio_subreadfile_adapterfastafile-180910_083901865" UniqueId="8c503789-931d-4481-8a88-f9b34ec5dfbc" Version="4.0.1" />
14+
<pbbase:ExternalResource Description="Points to the summary sts.xml file." MetaType="PacBio.SubreadFile.ChipStatsFile" Name="Chipstats XML" ResourceId="median.sts.xml" TimeStampedName="pacbio_subreadfile_chipstatsfile-180910_083901865" UniqueId="85e644f8-8ecf-4d81-9588-bc17df7057ae" Version="4.0.1" />
15+
</pbbase:ExternalResources>
16+
</pbbase:ExternalResource>
17+
</pbbase:ExternalResources>
18+
<pbds:DataSetMetadata>
19+
<pbds:TotalLength>30217318427</pbds:TotalLength>
20+
<pbds:NumRecords>2658562</pbds:NumRecords>
21+
<pbsample:BioSamples>
22+
<pbsample:BioSample Name="bc1019--bc1019">
23+
<pbsample:DNABarcodes>
24+
<pbsample:DNABarcode Name="bc1019--bc1019" UniqueId="87bb3511-466f-4640-8f54-ebcf47223294"/>
25+
</pbsample:DNABarcodes>
26+
</pbsample:BioSample>
27+
<pbsample:BioSample Name="bc1018--bc1018">
28+
<pbsample:DNABarcodes>
29+
<pbsample:DNABarcode Name="bc1018--bc1018" UniqueId="7e05ce25-a5be-4378-87bc-7c0df6e8008b"/>
30+
</pbsample:DNABarcodes>
31+
</pbsample:BioSample>
32+
</pbsample:BioSamples>
33+
<Collections xmlns="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd">
34+
<CollectionMetadata CreatedAt="2018-09-07T15:36:59.919Z" ModifiedAt="0001-01-01T00:00:00" UniqueId="0d045cc0-9929-4975-ac64-84922b1bd17a" MetaType="CollectionMetadata" TimeStampedName="54238-CollectionMetadata-2018-54-07T16:54:12.346Z" Status="Ready" InstrumentId="54238" InstrumentName="54238" Context="median">
35+
<InstCtrlVer>6.0.0.45616</InstCtrlVer>
36+
<SigProcVer>6.0.0.45300</SigProcVer>
37+
<RunDetails>
38+
<TimeStampedName>r54238_20180907_165412</TimeStampedName>
39+
<Name>2018-09-07_HG2_r54238</Name>
40+
<CreatedBy>ppeluso</CreatedBy>
41+
<WhenCreated>2018-09-07T15:36:59.919Z</WhenCreated>
42+
<StartedBy>unknown</StartedBy>
43+
<WhenStarted>0001-01-01T00:00:00</WhenStarted>
44+
</RunDetails>
45+
<WellSample Name="HG2_SBv2_ELF15kbA_4pM" CreatedAt="2018-09-07T15:36:59.919Z" ModifiedAt="0001-01-01T00:00:00">
46+
<WellName>B01</WellName>
47+
<Concentration>0</Concentration>
48+
<OnPlateLoadingConcentration>0</OnPlateLoadingConcentration>
49+
<InsertSize>15000</InsertSize>
50+
<SampleReuseEnabled>false</SampleReuseEnabled>
51+
<StageHotstartEnabled>false</StageHotstartEnabled>
52+
<SizeSelectionEnabled>false</SizeSelectionEnabled>
53+
<UseCount>0</UseCount>
54+
<BioSamples xmlns="http://pacificbiosciences.com/PacBioSampleInfo.xsd">
55+
<BioSample Name="HG2_SBv2_ELF15kbA_4pM" />
56+
</BioSamples>
57+
</WellSample>
58+
<Automation Name="Workflow_Diffusion.py">
59+
<AutomationParameters xmlns="http://pacificbiosciences.com/PacBioBaseDataModel.xsd">
60+
<AutomationParameter Name="MovieLength" CreatedAt="2018-09-07T15:36:59.919Z" ModifiedAt="0001-01-01T00:00:00" ValueDataType="Double" SimpleValue="1440" />
61+
<AutomationParameter Name="ExtensionTime" CreatedAt="2018-09-07T15:36:59.919Z" ModifiedAt="0001-01-01T00:00:00" ValueDataType="Double" SimpleValue="720" />
62+
<AutomationParameter Name="ExtendFirst" CreatedAt="2018-09-07T15:36:59.919Z" ModifiedAt="0001-01-01T00:00:00" ValueDataType="Boolean" SimpleValue="True" />
63+
<AutomationParameter Name="ReuseCell" CreatedAt="2018-09-07T15:36:59.919Z" ModifiedAt="0001-01-01T00:00:00" ValueDataType="Boolean" SimpleValue="False" />
64+
<AutomationParameter Name="ImmobilizationTime" ValueDataType="Double" SimpleValue="120.0" />
65+
<AutomationParameter Name="CellNFCIndex" ValueDataType="Int32" SimpleValue="1" />
66+
<AutomationParameter Name="ExtraIMWashes" ValueDataType="Int32" SimpleValue="2" />
67+
<AutomationParameter Name="Exposure" ValueDataType="Double" SimpleValue="0.01" />
68+
<AutomationParameter Name="PCDinPlate" ValueDataType="Boolean" SimpleValue="True" />
69+
<AutomationParameter Name="PreExtensionWorkflow" ValueDataType="Boolean" SimpleValue="True" />
70+
<AutomationParameter Name="CollectionNumber" ValueDataType="Int32" SimpleValue="1" />
71+
<AutomationParameter Name="UseStageHotStart" ValueDataType="Boolean" SimpleValue="False" />
72+
<AutomationParameter Name="InsertSize" ValueDataType="Int32" SimpleValue="15000" />
73+
<AutomationParameter Name="HasN2Switch" ValueDataType="Boolean" SimpleValue="True" />
74+
<AutomationParameter Name="TipSearchMaxDuration" ValueDataType="Int32" SimpleValue="576" />
75+
<AutomationParameter Name="SNRCut" ValueDataType="Double" SimpleValue="3.75" />
76+
</AutomationParameters>
77+
</Automation>
78+
<CollectionNumber>1</CollectionNumber>
79+
<CellIndex>1</CellIndex>
80+
<SetNumber>0</SetNumber>
81+
<CellPac Name="SMRT® Cell 1M v3 LR (4/Pack)" Description="Individual 4 Pack containing 4 SMRT®Cells each containing 1 million ZMWs" Version="3.0" PartNumber="101-531-001" LotNumber="324237" Barcode="BA243441" ExpirationDate="2019-04-16" MovieTimeGrade="LR">
82+
<ChipLayout xmlns="http://pacificbiosciences.com/PacBioBaseDataModel.xsd">SequEL_4.0_RTO3</ChipLayout>
83+
</CellPac>
84+
<TemplatePrepKit Name="SMRTbell® Template Prep Kit 1.0" Description="The SMRTbell® Template Prep Kit contains reagent supplies to perform SMRTbell library preparations of primer-annealed SMRTbell libraries for insert sizes ranging from 500 bp to over 20 kb." Tags="Template Prep Kit, TPK" Version="1.0" PartNumber="100-259-100" LotNumber="DM1234" Barcode="DM1234100259100123120" ExpirationDate="2020-12-31" MinInsertSize="500" MaxInsertSize="20000">
85+
<LeftAdaptorSequence xmlns="http://pacificbiosciences.com/PacBioBaseDataModel.xsd">ATCTCTCTCAACAACAACAACGGAGGAGGAGGAAAAGAGAGAGAT</LeftAdaptorSequence>
86+
<LeftPrimerSequence xmlns="http://pacificbiosciences.com/PacBioBaseDataModel.xsd">aacggaggaggagga</LeftPrimerSequence>
87+
<RightAdaptorSequence xmlns="http://pacificbiosciences.com/PacBioBaseDataModel.xsd">ATCTCTCTCAACAACAACAACGGAGGAGGAGGAAAAGAGAGAGAT</RightAdaptorSequence>
88+
<RightPrimerSequence xmlns="http://pacificbiosciences.com/PacBioBaseDataModel.xsd">aacggaggaggagga</RightPrimerSequence>
89+
</TemplatePrepKit>
90+
<BindingKit Name="Sequel® Binding Kit 3.0" Description="The Sequel Binding Kit 3.0 contains reagent supplies to bind prepared DNA template libraries to the Sequel Polymerase 3.0 in preparation for sequencing on the Sequel System. The result is a DNA polymerase/template complex. Sequel Binding Kit 3.0 should be used only with Sequel Sequencing Kit 3.0. Reagent quantities support 24 binding reactions." Tags="Binding Kit, BDK" Version="3.0" PartNumber="101-500-400" LotNumber="DM1234" Barcode="DM1234101500400123120" ExpirationDate="2020-12-31" ChipType="1mChip" />
91+
<SequencingKitPlate Name="Sequel® Sequencing Plate 3.0 (4 rxn)" Description="The DNA Sequencing Kit contains a sequencing reagent plate with chemistry for single molecule real-time sequencing on the PacBio Sequel®. Reagent quantities support 4 sequencing reactions to be used in conjunction with SMRT® Cell 4Pac(s). (4 Cells max/Each Row supplies reagents for 1 Sequel SMRT Cell)" Tags="Sequencing Kit, SQK" Version="3.0" PartNumber="101-427-800" LotNumber="013161" Barcode="013161101427800042619" ExpirationDate="2019-04-26" ChipType="1mChip" MaxCollections="4" NumOseTubes="0">
92+
<ReagentTubes Name="Sequel® SMRT®Cell Oil" PartNumber="100-619-600" LotNumber="012712" Barcode="012712100619600033122" ExpirationDate="2022-03-31" xmlns="http://pacificbiosciences.com/PacBioReagentKit.xsd" />
93+
</SequencingKitPlate>
94+
<Primary>
95+
<AutomationName>SequelAlpha</AutomationName>
96+
<ConfigFileName>SqlPoC_SubCrf_2C2A-t2.xml</ConfigFileName>
97+
<SequencingCondition>DefaultPrimarySequencingCondition</SequencingCondition>
98+
<OutputOptions>
99+
<ResultsFolder>314/PPeluso/HG2_15kb/r54238_20180907_165412/2_B01/</ResultsFolder>
100+
<CollectionPathUri>/pbi/collections/314/PPeluso/HG2_15kb/r54238_20180907_165412/2_B01/</CollectionPathUri>
101+
<CopyFiles>
102+
<CollectionFileCopy>Fasta</CollectionFileCopy>
103+
<CollectionFileCopy>Bam</CollectionFileCopy>
104+
</CopyFiles>
105+
<Readout>Bases_Without_QVs</Readout>
106+
<MetricsVerbosity>Minimal</MetricsVerbosity>
107+
<TransferResource>
108+
<Id>rsync-pbi-collections</Id>
109+
<TransferScheme>RSYNC</TransferScheme>
110+
<Name>PBI Collections Rsync</Name>
111+
<Description>Location for writing Transfer services to write to. For testing, Internal tools (PA SIM and ICS) tests must explicitly set the relative path prefix to 'xfer-test'</Description>
112+
<DestPath>/pbi/collections</DestPath>
113+
</TransferResource>
114+
</OutputOptions>
115+
</Primary>
116+
<Secondary>
117+
<AutomationName>DefaultSecondaryAutomationName</AutomationName>
118+
<AutomationParameters>
119+
<AutomationParameter Name="Reference" CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" ValueDataType="String" SimpleValue="DefaultSecondaryAnalysisReferenceName" />
120+
</AutomationParameters>
121+
<CellCountInJob>0</CellCountInJob>
122+
</Secondary>
123+
<UserDefinedFields>
124+
<DataEntities Name=" LIMS_IMPORT " ValueDataType="String" SimpleValue="DefaultUserDefinedFieldLIMS" xmlns="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" />
125+
</UserDefinedFields>
126+
<ComponentVersions>
127+
<VersionInfo Name="ics" Version="6.0.0.45616" />
128+
<VersionInfo Name="iui" Version="6.0.0.45616" />
129+
<VersionInfo Name="chemistry" Version="6.0.0.45111" />
130+
<VersionInfo Name="pa" Version="6.0.0.45300" />
131+
<VersionInfo Name="paws" Version="6.0.0.45300" />
132+
<VersionInfo Name="ppa" Version="6.0.0.45300" />
133+
<VersionInfo Name="realtime" Version="6.0.0.45300" />
134+
<VersionInfo Name="transfer" Version="6.0.0.45300" />
135+
<VersionInfo Name="smrtlink-analysisservices-gui" Version="6.0.0.45618" />
136+
<VersionInfo Name="smrtimisc" Version="6.0.0.45621" />
137+
<VersionInfo Name="smrtlink" Version="6.0.0.45621" />
138+
<VersionInfo Name="smrttools" Version="6.0.0.45580" />
139+
<VersionInfo Name="smrtinub" Version="6.0.0.45580" />
140+
<VersionInfo Name="smrtview" Version="6.0.0.45580" />
141+
</ComponentVersions>
142+
</CollectionMetadata>
143+
</Collections>
144+
</pbds:DataSetMetadata>
145+
</pbds:SubreadSet>
146+

tests/cram/splitsample.t

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
$ MERGED=$TESTDIR/data/merged.dataset.xml
22
$ REF=$TESTDIR/data/ecoliK12_pbi_March2013.fasta
3+
$ NO_SM_BIOSAMPLES=$TESTDIR/data/no_sm_biosamples.subreadset.xml
34

45
$ $__PBTEST_PBMM2_EXE align $MERGED $REF $CRAMTMP/split.bam --split-by-sample
56

@@ -57,3 +58,9 @@ When both --split-by-sample and --sample were set, expect to see only one bam fi
5758
$ [[ -f $CRAMTMP/splitsampleoverride.bam ]] || echo "File does not exist!"
5859
$ samtools view -H $CRAMTMP/splitsampleoverride.bam | grep "@RG" | grep -vP "@PG\tID:samtools" | cut -f 6 | sort | uniq
5960
SM:MySample
61+
62+
$ $__PBTEST_PBMM2_EXE align --split-by-sample $NO_SM_BIOSAMPLES $REF $CRAMTMP/split-no-sm.bam
63+
$ samtools view -H $CRAMTMP/split-no-sm.UnnamedSample.bam | grep -c "@RG"
64+
1
65+
$ samtools view -H $CRAMTMP/split-no-sm.UnnamedSample.bam | grep -c "SM:UnnamedSample"
66+
1

0 commit comments

Comments
 (0)