Skip to content

Raw data processing notebook to Python files#4

Open
DhmhtrhsPakakis wants to merge 13 commits into
main from
pipeline-scripts
Open

Raw data processing notebook to Python files#4
DhmhtrhsPakakis wants to merge 13 commits into
main from
pipeline-scripts

Conversation

@DhmhtrhsPakakis

Copy link
Copy Markdown
Contributor

Starting from the Hydrogen_Preprocessing_Custom.ipynb notebook, I created Python files in the raw_data_pipeline_tools folder and a main.py that uses those files to run the pipeline.

The pipeline takes the raw data from the observation as input and produces either the calibrated spectrum or the target-only spectrum as output.

Note: The output of this pipeline, the calibrated signal, will be the input for the main data analysis pipelines (HI line extraction, background removal, etc.). In this version the calibrated signal is not returned; it is only used in main for plots. If this cannot be changed afterwards so that the signal is returned, then don't accept the PR.

Copilot AI left a comment

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pull request overview

This PR converts a raw-data preprocessing notebook into importable Python modules plus a main.py runner intended to ingest ON/OFF observations, optionally calibrate (ON/OFF), and visualize the resulting spectra.

Changes:

  • Added .dat → .csv conversion utility (dat_to_csv).
  • Added a helper to average a time series into an FFT-sized spectrum (get_avg_signal).
  • Added main.py to orchestrate loading, optional calibration, and plotting.

Reviewed changes

Copilot reviewed 3 out of 4 changed files in this pull request and generated 12 comments.

File | Description
raw_data_pipeline_tools/data_convert_to_csv.py | Adds a utility to convert binary .dat float32 samples into a single-column CSV.
raw_data_pipeline_tools/average_signal_fftsize.py | Adds a helper that reshapes samples into fft_size blocks and averages them.
raw_data_pipeline_tools/__init__.py | Introduces a package marker for the new tools folder.
main.py | Adds a runnable script wiring conversion, averaging, optional calibration, and plotting.

💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.

"""
Create the average spectrum from the time series.
"""
reshaped = time_series.reshape(-1, fft_size)
def get_avg_signal(time_series, fft_size):
"""
Create the average spectrum from the time series.
"""
Comment thread main.py Outdated
Comment on lines +15 to +41
fft_size = 2048

frequencies = np.linspace(1.4205 - 0.003840/2, 1.4205000 + 0.003840/2, fft_size)

on_observation_filename = '2502202_Hot202020.csv'
off_observation_filename = "2502202_Cold202020.csv"

# Files Management
if on_observation_filename.endswith(".dat"):
base_name = on_observation_filename.split('.')[0]
on_observation_filename = f"{base_name}.csv"
dat_to_csv(f"{base_name}.dat", on_observation_filename)

if off_observation_filename.endswith(".dat"):
base_name = off_observation_filename.split('.')[0]
off_observation_filename = f"{base_name}.csv"
dat_to_csv(f"{base_name}.dat", off_observation_filename)

# ON signal proccesing
on_series_df = pd.read_csv(on_observation_filename)
on_series_df = on_series_df.filter(regex='power_au|y_axis')
on_series_np = on_series_df.to_numpy()

avg_on = on_series_np
if len(on_series_np) > fft_size:
avg_on = get_avg_signal(on_series_np, fft_size)

Comment thread main.py Outdated
Comment on lines +34 to +37
on_series_df = pd.read_csv(on_observation_filename)
on_series_df = on_series_df.filter(regex='power_au|y_axis')
on_series_np = on_series_df.to_numpy()

Comment thread main.py Outdated

calibrated_signal = avg_on / avg_off

fig, (ax1, ax2, ax3) = plt.subplots(nrows=3, ncols=1, figsize=(8, 10))
Comment thread main.py
from raw_data_pipeline_tools.data_convert_to_csv import dat_to_csv
from raw_data_pipeline_tools.average_signal_fftsize import get_avg_signal

def main():
Comment thread main.py Outdated
Comment on lines +22 to +31
# Files Management
if on_observation_filename.endswith(".dat"):
base_name = on_observation_filename.split('.')[0]
on_observation_filename = f"{base_name}.csv"
dat_to_csv(f"{base_name}.dat", on_observation_filename)

if off_observation_filename.endswith(".dat"):
base_name = off_observation_filename.split('.')[0]
off_observation_filename = f"{base_name}.csv"
dat_to_csv(f"{base_name}.dat", off_observation_filename)
"""
y_data_list = np.fromfile(filename, dtype=np.float32)

with open(csv_filename, 'w', newline='') as csvfile:
Comment thread main.py Outdated
off_observation_filename = f"{base_name}.csv"
dat_to_csv(f"{base_name}.dat", off_observation_filename)

# ON signal proccesing
Comment thread main.py Outdated
Comment on lines +42 to +69
# Check for Calibration & Plotting
if calibration:
off_series_df = pd.read_csv(off_observation_filename)
off_series_df = off_series_df.filter(regex='power_au|y_axis')
off_series_np = off_series_df.to_numpy()

avg_off = off_series_np
if len(off_series_np) > fft_size:
avg_off = get_avg_signal(off_series_np, fft_size)

calibrated_signal = avg_on / avg_off

fig, (ax1, ax2, ax3) = plt.subplots(nrows=3, ncols=1, figsize=(8, 10))

ax1.plot(frequencies, avg_off, color='blue')
ax1.set_title('Avg Cold/Off')
ax1.set_ylabel('Relative Power')

ax2.plot(frequencies, avg_on, color='red')
ax2.set_title('Avg Hot/On')
ax2.set_ylabel('Relative Power')

ax3.plot(frequencies, calibrated_signal, color='green')
ax3.set_title('On/Off calibration')
ax3.set_ylabel('Relative Power')
ax3.set_xlabel('Frequencies')
plt.tight_layout()
plt.show()

Copilot AI left a comment

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pull request overview

Copilot reviewed 7 out of 9 changed files in this pull request and generated 7 comments.

Comment on lines +15 to +24
try:
on_signal_numpy = np.fromfile(on_file_path, dtype=np.float32)
off_signal_numpy = np.fromfile(off_file_path, dtype=np.float32)

except FileNotFoundError:
print("File not found") # noqa: T201
except Exception as e: # noqa: BLE001
print(f"Convert Error: {e}") # noqa: T201
else:
return on_signal_numpy, off_signal_numpy
Comment on lines +3 to +5
from .average_signal_fftsize import get_avg_signal
from .convert_to_numpy import convert_dat_to_numpy
from .preprocessing_plots import create_preprocessing_plots


def preprocessing_pipeline(
on_signal_filename: str, off_signal_filename: str, fft_size: int, calibration_method: str = "on/off", plot_analysis: bool = True
elif calibration_method == "on-off":
calibrated_signal: np.ndarray = on_spectrum_avg - off_spectrum_avg
else:
msg = "Calibration Method does not exists."
Comment on lines +5 to +12
def create_preprocessing_plots(on_spectrum_avg: np.array, off_spectrum_avg: np.array, calibrated_signal: np.array, fft_size: int) -> None:
"""
Create on_spectrum , off_spectrum , calibrated_spectrum plots in frequencies axes.

Args:
on_spectrum_avg (np.array): the on spectrum
off_spectrum_avg (np.array): the off spectrum
calibrated_signal (np.array): the calibrated spectrum
calibrated_signal (np.array): the calibrated spectrum
fft_size (int): the fft size used in the observation
"""
frequencies = np.linspace(1.4205 - 0.003840 / 2, 1.4205000 + 0.003840 / 2, fft_size)
Comment thread main.py Outdated
Comment on lines +1 to +10
# Import functions from folder
from raw_data_pipeline_tools.preprocessing_pipeline import preprocessing_pipeline


def main() -> None:
on_filename = "/home/dimitrios-pakakis/Desktop/Astro/data-analysis/2502202_Hot202020.dat"
off_filename = "/home/dimitrios-pakakis/Desktop/Astro/data-analysis/2502202_Cold202020.dat"
fft_size = 2048
_ = preprocessing_pipeline(
on_signal_filename=on_filename, off_signal_filename=off_filename, fft_size=fft_size, calibration_method="on/off", plot_analysis=True
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants