5757LEGACY_MS_LABELS = "legacy_ms"
5858
5959
60+ @dataclass
61+ class VcfModelMapping :
62+ individuals_nodes : np .ndarray
63+ individuals_name : np .ndarray
64+
65+
6066class CoalescenceRecord (NamedTuple ):
6167 left : float
6268 right : float
@@ -10520,7 +10526,13 @@ def sample_nodes_by_ploidy(self, ploidy):
1052010526 sample_node_ids = sample_node_ids .reshape ((num_samples_per_individual , ploidy ))
1052110527 return sample_node_ids
1052210528
10523- def map_samples_to_vcf (self , individuals = None , ploidy = None ):
10529+ def map_to_vcf_model (
10530+ self ,
10531+ individuals = None ,
10532+ ploidy = None ,
10533+ name_metadata_key = None ,
10534+ individual_names = None ,
10535+ ):
1052410536 """
1052510537 Returns a list of lists of node IDs, where each sublist contains the
1052610538 sample nodes associated with the same individual.
@@ -10529,6 +10541,11 @@ def map_samples_to_vcf(self, individuals=None, ploidy=None):
1052910541 only the nodes for the specified individuals are returned.
1053010542 """
1053110543
10544+ if name_metadata_key is not None and individual_names is not None :
10545+ raise ValueError (
10546+ "Cannot specify both name_metadata_key and individual_names"
10547+ )
10548+
1053210549 if self .num_individuals > 0 and ploidy is not None :
1053310550 raise ValueError (
1053410551 "Cannot specify ploidy when individuals are present in the tree sequence"
@@ -10548,47 +10565,66 @@ def map_samples_to_vcf(self, individuals=None, ploidy=None):
1054810565 if self .num_individuals == 0 and individuals is None :
1054910566 if ploidy is None :
1055010567 ploidy = 1
10551- return self .sample_nodes_by_ploidy (ploidy )
10552-
10553- individuals_nodes = []
10554- if individuals is None :
10555- for ind in self .individuals ():
10556- if len (ind .nodes ) == 0 :
10557- warnings .warn (
10558- f"Individual { ind .id } has no nodes associated with it." ,
10559- stacklevel = 1 ,
10568+ individuals_nodes = self .sample_nodes_by_ploidy (ploidy )
10569+ else :
10570+ individuals_nodes = []
10571+ ts_individual_names = []
10572+ if individuals is None :
10573+ for ind in self .individuals ():
10574+ if len (ind .nodes ) == 0 :
10575+ warnings .warn (
10576+ f"Individual { ind .id } has no nodes associated with it." ,
10577+ stacklevel = 1 ,
10578+ )
10579+ continue
10580+ is_sample = np .array (
10581+ [self .node_flags (u ) & tskit .NODE_IS_SAMPLE for u in ind .nodes ]
1056010582 )
10561- continue
10562- is_sample = np .array (
10563- [self .node_flags (u ) & tskit .NODE_IS_SAMPLE for u in ind .nodes ]
10564- )
10565- if all (is_sample ):
10583+ if all (is_sample ):
10584+ individuals_nodes .append (ind .nodes )
10585+ if name_metadata_key is not None :
10586+ ts_individual_names .append (ind .metadata [name_metadata_key ])
10587+ else :
10588+ ts_individual_names .append (f"tsk_{ ind .id } " )
10589+ elif all (~ is_sample ):
10590+ continue
10591+ else :
10592+ warnings .warn (
10593+ f"Individual { ind .id } has both sample and non-sample nodes "
10594+ "associated with it." ,
10595+ stacklevel = 1 ,
10596+ )
10597+ else :
10598+ for i in individuals :
10599+ if i < 0 or i >= self .num_individuals :
10600+ raise ValueError (f"Invalid individual ID { i } " )
10601+ ind = self .individual (i )
10602+ if len (ind .nodes ) == 0 :
10603+ raise ValueError (
10604+ f"Individual { i } has no nodes associated with it."
10605+ )
1056610606 individuals_nodes .append (ind .nodes )
10567- elif all (~ is_sample ):
10568- continue
10569- else :
10570- warnings .warn (
10571- f"Individual { ind .id } has both sample and non-sample nodes "
10572- "associated with it." ,
10573- stacklevel = 1 ,
10574- )
10575- else :
10576- for i in individuals :
10577- if i < 0 or i >= self .num_individuals :
10578- raise ValueError (f"Invalid individual ID { i } " )
10579- ind = self .individual (i )
10580- if len (ind .nodes ) == 0 :
10581- raise ValueError (f"Individual { i } has no nodes associated with it." )
10582- individuals_nodes .append (ind .nodes )
10583-
10584- if len (individuals_nodes ) == 0 :
10585- return np .array ([], dtype = np .int32 ).reshape ((0 , 0 ))
10586-
10587- max_nodes = max (len (nodes ) for nodes in individuals_nodes )
10588- result = np .full ((len (individuals_nodes ), max_nodes ), - 1 , dtype = np .int32 )
10589- for i , nodes in enumerate (individuals_nodes ):
10590- result [i , : len (nodes )] = nodes
10591- return result
10607+ if name_metadata_key is not None :
10608+ ts_individual_names .append (ind .metadata [name_metadata_key ])
10609+ else :
10610+ ts_individual_names .append (f"tsk_{ ind .id } " )
10611+
10612+ max_nodes = max (len (nodes ) for nodes in individuals_nodes )
10613+ result = np .full ((len (individuals_nodes ), max_nodes ), - 1 , dtype = np .int32 )
10614+ for i , nodes in enumerate (individuals_nodes ):
10615+ result [i , : len (nodes )] = nodes
10616+ individuals_nodes = result
10617+
10618+ if individual_names is None :
10619+ individual_names = ts_individual_names
10620+ individual_names = np .array (individual_names , dtype = object )
10621+
10622+ if len (individuals_nodes ) != len (individual_names ):
10623+ raise ValueError (
10624+ "The number of individuals does not match the number of names"
10625+ )
10626+
10627+ return VcfModelMapping (result , individual_names )
1059210628
1059310629 ############################################
1059410630 #
0 commit comments