@@ -414,6 +414,13 @@ def test_qnn_backend_conv1d(self):
             with self.subTest(i=i):
                 self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_conv1d_batch_norm(self):
+        modules = [Conv1dBn(), Conv1dBn(bias=False)]  # noqa: F405
+        sample_input = (torch.randn([1, 2048, 858]),)
+        for i, module in enumerate(modules):
+            with self.subTest(i=i):
+                self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_conv2d(self):
         modules = [Conv2dSequential(), Conv2dSequential(bias=False)]  # noqa: F405
         sample_input = (torch.randn([1, 1, 3, 3]),)
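
Note: Conv1dBn comes from the test-model star import (hence the noqa: F405 comment); its definition is not part of this diff. A minimal sketch of what such a module plausibly looks like; only the 2048 input channels are implied by the sample input above, while out_channels and kernel_size are assumptions:

import torch
import torch.nn as nn

class Conv1dBn(nn.Module):
    # Hypothetical sketch, not the module under test: a 1D convolution
    # followed by batch norm, the pattern the new test exercises so the
    # backend's conv+BN handling gets covered for the 1D case.
    def __init__(self, bias=True):
        super().__init__()
        self.conv = nn.Conv1d(in_channels=2048, out_channels=512, kernel_size=3, bias=bias)
        self.bn = nn.BatchNorm1d(512)

    def forward(self, x):
        return self.bn(self.conv(x))
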
@@ -2829,6 +2836,14 @@ def test_qnn_backend_conv1d(self):
                 module = self.get_qdq_module(module, sample_input)
                 self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_conv1d_batch_norm(self):
+        modules = [Conv1dBn(), Conv1dBn(bias=False)]  # noqa: F405
+        sample_input = (torch.randn([1, 2048, 858]),)
+        for i, module in enumerate(modules):
+            with self.subTest(i=i):
+                module = self.get_qdq_module(module, sample_input)
+                self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_conv2d(self):
         modules = [Conv2dSequential(), Conv2dSequential(bias=False)]  # noqa: F405
         sample_input = (torch.randn([1, 1, 3, 3]),)
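
For reference, get_qdq_module is the helper these quantized tests use to turn the eager module into a quantize-dequantize (QDQ) annotated graph before lowering. A rough sketch of that flow under the PT2E quantization API; the capture call and quantizer configuration here are assumptions, not the helper's actual body:

import torch
from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e

def get_qdq_module_sketch(module, sample_input):
    # Capture the eager module, annotate it with quant/dequant observers,
    # run one calibration pass, then materialize the QDQ graph.
    graph = torch.export.export_for_training(module, sample_input).module()
    prepared = prepare_pt2e(graph, QnnQuantizer())
    prepared(*sample_input)  # calibration with the test input
    return convert_pt2e(prepared)
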
@@ -7356,13 +7371,30 @@ class MLLMSpecs:
         tok_embedding_pte_size: float
         decoder_pte_size: float
 
+    @dataclass(frozen=True)
+    class ALMSpecs(MLLMSpecs):
+        audio_path: str
+        golden_audio_feature: str
+
     @dataclass(frozen=True)
     class VLMSpecs(MLLMSpecs):
         image_path: str
         golden_image_feature: str
 
     # TODO: refactor to support different backends
     def setUp(self):
+        self.alm_specs = {
+            "granite_speech_3_3-2b": TestExampleMultimodalityScript.ALMSpecs(
+                max_seq_len=512,
+                sm8650_token_rate=5,
+                sm8750_token_rate=8,
+                encoder_pte_size=900_000_000,  # 900MB
+                tok_embedding_pte_size=240_000_000,  # 240MB
+                decoder_pte_size=3_000_000_000,  # 3GB
+                audio_path="https://huggingface.co/ibm-granite/granite-speech-3.3-2b/resolve/main/10226_10111_000000.wav?download=true",  # Audio content: after his nap, ...
+                golden_audio_feature="after his nap,",
+            ),
+        }
         self.vlm_specs = {
             "smolvlm_500m_instruct": TestExampleMultimodalityScript.VLMSpecs(
                 max_seq_len=128,
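
Since ALMSpecs and VLMSpecs both inherit from the frozen MLLMSpecs dataclass, an ALMSpecs instance carries every base field (size budgets, token rates, max_seq_len) plus the two audio-specific ones, and is immutable once built. A standalone illustration of that behavior, with the field list trimmed for brevity:

from dataclasses import dataclass

@dataclass(frozen=True)
class MLLMSpecs:
    max_seq_len: int
    decoder_pte_size: float

@dataclass(frozen=True)
class ALMSpecs(MLLMSpecs):
    audio_path: str

# Base-class fields come first in the generated __init__, subclass fields after.
specs = ALMSpecs(max_seq_len=512, decoder_pte_size=3e9, audio_path="sample.wav")
print(specs.max_seq_len)    # inherited field: 512
# specs.max_seq_len = 1024  # would raise dataclasses.FrozenInstanceError
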
@@ -7386,6 +7418,96 @@ def setUp(self):
             ),
         }
 
+    def test_static_asr(self):
+        if not self.required_envs([self.model_name]):
+            self.skipTest("missing required envs")
+
+        if self.enable_x86_64:
+            # Running on host is extremely slow for large models, so we skip
+            # this check to avoid timeouts. Please verify the output on the
+            # actual device instead.
+            self.skipTest(
+                "Skipping the check for the static ASR model on x86 due to long execution time."
+            )
+
+        alm_specs: TestExampleMultimodalityScript.ALMSpecs = self.alm_specs[
+            self.model_name
+        ]
+        prompt = "can you transcribe the speech into a written format?"
+        audio_path = alm_specs.audio_path
+        cmds = [
+            "python",
+            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
+            "--artifact",
+            self.artifact_dir,
+            "--build_folder",
+            self.build_folder,
+            "--soc_model",
+            self.soc_model,
+            "--ip",
+            self.ip,
+            "--port",
+            str(self.port),
+            "--prompt",
+            prompt,
+            "--audio_path",
+            audio_path,
+            "--temperature",
+            "0",
+            "--decoder_model",
+            f"{self.model_name}",
+            "--model_mode",
+            "kv",
+            "--max_seq_len",
+            f"{alm_specs.max_seq_len}",
+        ]
+        if self.compile_only:
+            cmds.extend(["--compile_only"])
+        elif self.device:
+            cmds.extend(["--device", self.device])
+            if self.host:
+                cmds.extend(["--host", self.host])
+        if self.pre_gen_pte:
+            cmds.extend(["--pre_gen_pte", self.pre_gen_pte])
+
+        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
+        with Listener((self.ip, self.port)) as listener:
+            conn = listener.accept()
+            p.communicate()
+            msg = json.loads(conn.recv())
+            if "Error" in msg:
+                self.fail(msg["Error"])
+            else:
+                if not self.compile_only:
+                    model_out = msg["result"][0]
+                    self.assertTrue(
+                        alm_specs.golden_audio_feature in model_out.lower(),
+                        f"Expected output to contain feature: '{alm_specs.golden_audio_feature}'. Actual output: '{model_out}'",
+                    )
+                    print(f"Audio Path: {audio_path}")
+                    print(f"Query: {prompt}")
+                    print(f"Answer: {model_out}")
+
+                    encoder_pte_size = msg["audio_encoder_pte_size"]
+                    tok_embedding_pte_size = msg["tok_embedding_pte_size"]
+                    decoder_pte_size = msg["pte_size"]
+                    self.assertLessEqual(encoder_pte_size, alm_specs.encoder_pte_size)
+                    self.assertLessEqual(
+                        tok_embedding_pte_size, alm_specs.tok_embedding_pte_size
+                    )
+                    self.assertLessEqual(decoder_pte_size, alm_specs.decoder_pte_size)
+                    print(f"Encoder PTE Size: {encoder_pte_size} bytes")
+                    print(f"Token Embedding PTE Size: {tok_embedding_pte_size} bytes")
+                    print(f"Text Decoder PTE Size: {decoder_pte_size} bytes")
+
+                attr_name = f"{self.soc_model.lower()}_token_rate"
+                if not self.compile_only and hasattr(alm_specs, attr_name):
+                    device_inference_speed = msg["inference_speed"]
+                    expected_inference_speed = getattr(alm_specs, attr_name)
+                    print(f"Prompt Evaluation: {device_inference_speed} tokens/second")
+                    self.assertGreaterEqual(
+                        device_inference_speed, expected_inference_speed
+                    )
+
     def test_static_vlm(self):
         if not self.required_envs([self.model_name]):
             self.skipTest("missing required envs")
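
The Listener/accept/recv sequence in test_static_asr is the result channel for all of these device tests: the spawned llama.py run connects back over multiprocessing.connection and ships a JSON payload once inference finishes, which the test then asserts on. A self-contained sketch of that handshake; the address and payload keys are illustrative:

import json
import threading
from multiprocessing.connection import Client, Listener

ADDRESS = ("127.0.0.1", 9000)

def runner_side():
    # Stand-in for the spawned script: connect back and report results.
    with Client(ADDRESS) as conn:
        conn.send(json.dumps({"result": ["after his nap, ..."], "pte_size": 123}))

# Test-harness side, mirroring the pattern used in test_static_asr.
with Listener(ADDRESS) as listener:
    threading.Thread(target=runner_side).start()
    conn = listener.accept()
    msg = json.loads(conn.recv())
    print(msg["result"][0], msg["pte_size"])
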
@@ -7450,7 +7572,7 @@ def test_static_vlm(self):
                 print(f"Query: {prompt}")
                 print(f"Answer: {model_out}")
                 if not self.enable_x86_64:
-                    encoder_pte_size = msg["encoder_pte_size"]
+                    encoder_pte_size = msg["vision_encoder_pte_size"]
                     tok_embedding_pte_size = msg["tok_embedding_pte_size"]
                     decoder_pte_size = msg["pte_size"]
                     self.assertLessEqual(encoder_pte_size, vlm_specs.encoder_pte_size)