Spaces:

alwaysgood
/

my-tide-env

Sleeping

App Files Community

alwaysgood committed on Aug 20

Commit

c1d663e

verified ·

1 Parent(s): 902ce76

1

Browse files

Files changed (1) hide show

preprocessing.py +36 -31

preprocessing.py CHANGED Viewed

@@ -33,25 +33,25 @@ def convert_tide_level_to_residual(df, station_id):
     else:
         df['date'] = df['date'].dt.tz_convert(kst)
-    # 4. 마지막 144개 데이터만 사용 (슬라이싱 최적화)
-    df_last_144 = df.tail(144).copy()
-    start_time = df_last_144['date'].min()
-    end_time = df_last_144['date'].max()
-    print(f"📅 마지막 144개 데이터 시간 범위: {start_time} ~ {end_time}")
-    # 5. Supabase에서 harmonic_level 조회 (144개만)
     try:
         harmonic_data = get_harmonic_predictions(station_id, start_time, end_time)
         print(f"📊 조화 예측 데이터 {len(harmonic_data) if harmonic_data else 0}개 조회")
         if not harmonic_data:
             print("⚠️ 조화 예측 데이터가 없습니다. 가상 데이터로 대체합니다.")
-            return create_mock_residual_data(df_last_144)
     except Exception as e:
         print(f"❌ Supabase 조회 오류: {e}")
         print("⚠️ 가상 데이터로 대체합니다.")
-        return create_mock_residual_data(df_last_144)
     # 6. harmonic_data를 딕셔너리로 변환 (시간 기준)
     harmonic_dict = {}
@@ -80,11 +80,11 @@ def convert_tide_level_to_residual(df, station_id):
     print(f"📊 사용 가능한 조화 데이터: {len(harmonic_dict)}개")
-    # 7. residual 계산 (마지막 144개만)
     residual_values = []
     successful_conversions = 0
-    for idx, row in df_last_144.iterrows():
         tide_level = row['tide_level']
         timestamp = row['date']
@@ -101,8 +101,8 @@ def convert_tide_level_to_residual(df, station_id):
         # 이상치 플래그 확인
         is_outlier = False
-        if '_tide_outlier_flag' in df_last_144.columns:
-            is_outlier = df_last_144.at[idx, '_tide_outlier_flag'] if not pd.isna(df_last_144.at[idx, '_tide_outlier_flag']) else False
         if is_outlier:
             # 이상치로 탐지된 경우 residual = 0 (harmonic만 사용)
@@ -117,18 +117,18 @@ def convert_tide_level_to_residual(df, station_id):
             # 조화 데이터가 없으면 평균값으로 대체
             residual_values.append(0.0)
-    # 8. residual 컬럼 추가 (마지막 144개 데이터에만)
-    df_last_144['residual'] = residual_values
     # 9. tide_level 컬럼 제거 (모델에서 사용하지 않음)
-    if 'tide_level' in df_last_144.columns:
-        df_last_144 = df_last_144.drop(columns=['tide_level'])
         print("🗑️ tide_level 컬럼 제거 (변환 완료)")
-    conversion_rate = successful_conversions / len(df_last_144) * 100
-    print(f"✅ 변환 완료: {successful_conversions}/{len(df_last_144)} ({conversion_rate:.1f}%)")
-    return df_last_144
 def parse_time_string(time_str):
     """다양한 형태의 시간 문자열 파싱"""
@@ -697,7 +697,7 @@ def handle_missing_values(df, station_id=None):
 def preprocess_uploaded_file(file_path, station_id):
     """
     업로드된 파일의 전체 전처리 파이프라인
-    이상치 탐지 → 결측치 처리 → tide_level → residual 변환 + 검증
     """
     try:
         print(f"\n🚀 {station_id} 관측소 데이터 전처리 시작")
@@ -712,26 +712,31 @@ def preprocess_uploaded_file(file_path, station_id):
         if not is_valid:
             return None, f"입력 데이터 오류:\n" + "\n".join(issues)
-        # 3. 이상치 탐지 및 처리
-        print("\n🔍 이상치 탐지 및 처리 단계")
-        # 3-1. Harmonic 기반 tide_level 이상치 탐지
-        tide_outliers = detect_harmonic_based_outliers(df, station_id)
         if tide_outliers.any():
             print(f"🌊 tide_level 이상치 {tide_outliers.sum()}개 → residual=0 처리 예정")
-            df.loc[tide_outliers, '_tide_outlier_flag'] = True
-        # 3-2. 기상 데이터 물리적 한계 기반 이상치 탐지
-        weather_outliers = detect_weather_outliers(df)
         for col in weather_outliers.columns:
             if weather_outliers[col].any():
                 print(f"🌡️ {col} 이상치 {weather_outliers[col].sum()}개 → NaN 변환")
-                df.loc[weather_outliers[col], col] = np.nan
-        # 4. 결측치 처리
-        df_cleaned = handle_missing_values(df, station_id)
-        # 5. tide_level → residual 변환 (이상치 플래그 반영)
         converted_df = convert_tide_level_to_residual(df_cleaned, station_id)
         # 5. 변환된 데이터를 임시 파일로 저장

     else:
         df['date'] = df['date'].dt.tz_convert(kst)
+    # 4. 입력 데이터는 이미 144개로 슬라이싱된 상태
+    df_input = df.copy()
+    start_time = df_input['date'].min()
+    end_time = df_input['date'].max()
+    print(f"📅 입력 데이터 시간 범위: {start_time} ~ {end_time}")
+    # 5. Supabase에서 harmonic_level 조회
     try:
         harmonic_data = get_harmonic_predictions(station_id, start_time, end_time)
         print(f"📊 조화 예측 데이터 {len(harmonic_data) if harmonic_data else 0}개 조회")
         if not harmonic_data:
             print("⚠️ 조화 예측 데이터가 없습니다. 가상 데이터로 대체합니다.")
+            return create_mock_residual_data(df_input)
     except Exception as e:
         print(f"❌ Supabase 조회 오류: {e}")
         print("⚠️ 가상 데이터로 대체합니다.")
+        return create_mock_residual_data(df_input)
     # 6. harmonic_data를 딕셔너리로 변환 (시간 기준)
     harmonic_dict = {}
     print(f"📊 사용 가능한 조화 데이터: {len(harmonic_dict)}개")
+    # 7. residual 계산
     residual_values = []
     successful_conversions = 0
+    for idx, row in df_input.iterrows():
         tide_level = row['tide_level']
         timestamp = row['date']
         # 이상치 플래그 확인
         is_outlier = False
+        if '_tide_outlier_flag' in df_input.columns:
+            is_outlier = df_input.at[idx, '_tide_outlier_flag'] if not pd.isna(df_input.at[idx, '_tide_outlier_flag']) else False
         if is_outlier:
             # 이상치로 탐지된 경우 residual = 0 (harmonic만 사용)
             # 조화 데이터가 없으면 평균값으로 대체
             residual_values.append(0.0)
+    # 8. residual 컬럼 추가
+    df_input['residual'] = residual_values
     # 9. tide_level 컬럼 제거 (모델에서 사용하지 않음)
+    if 'tide_level' in df_input.columns:
+        df_input = df_input.drop(columns=['tide_level'])
         print("🗑️ tide_level 컬럼 제거 (변환 완료)")
+    conversion_rate = successful_conversions / len(df_input) * 100
+    print(f"✅ 변환 완료: {successful_conversions}/{len(df_input)} ({conversion_rate:.1f}%)")
+    return df_input
 def parse_time_string(time_str):
     """다양한 형태의 시간 문자열 파싱"""
 def preprocess_uploaded_file(file_path, station_id):
     """
     업로드된 파일의 전체 전처리 파이프라인
+    슬라이싱 → 이상치 탐지 → 결측치 처리 → tide_level → residual 변환 + 검증
     """
     try:
         print(f"\n🚀 {station_id} 관측소 데이터 전처리 시작")
         if not is_valid:
             return None, f"입력 데이터 오류:\n" + "\n".join(issues)
+        # 3. 마지막 144개로 먼저 슬라이싱 (모델 입력 크기)
+        print(f"✂️ 마지막 144개 데이터로 슬라이싱 (모델 입력 크기)")
+        df_sliced = df.tail(144).copy()
+        print(f"📊 슬라이싱 후 데이터: {len(df_sliced)}행 × {len(df_sliced.columns)}열")
+        # 4. 이상치 탐지 및 처리 (144개만 대상)
+        print("\n🔍 이상치 탐지 및 처리 단계 (144개 데이터 기준)")
+        # 4-1. Harmonic 기반 tide_level 이상치 탐지
+        tide_outliers = detect_harmonic_based_outliers(df_sliced, station_id)
         if tide_outliers.any():
             print(f"🌊 tide_level 이상치 {tide_outliers.sum()}개 → residual=0 처리 예정")
+            df_sliced.loc[tide_outliers, '_tide_outlier_flag'] = True
+        # 4-2. 기상 데이터 물리적 한계 기반 이상치 탐지
+        weather_outliers = detect_weather_outliers(df_sliced)
         for col in weather_outliers.columns:
             if weather_outliers[col].any():
                 print(f"🌡️ {col} 이상치 {weather_outliers[col].sum()}개 → NaN 변환")
+                df_sliced.loc[weather_outliers[col], col] = np.nan
+        # 5. 결측치 처리
+        df_cleaned = handle_missing_values(df_sliced, station_id)
+        # 6. tide_level → residual 변환 (이상치 플래그 반영)
         converted_df = convert_tide_level_to_residual(df_cleaned, station_id)
         # 5. 변환된 데이터를 임시 파일로 저장