From ca5e1ddbc02215a0a26d5f326b0e6658ae031e02 Mon Sep 17 00:00:00 2001 From: Giuseppe Nucifora Date: Tue, 19 Nov 2024 07:51:15 +0100 Subject: [PATCH] wip train dataset --- src/README.md | 2 +- .../working/models/technique_mapping.joblib | Bin 0 -> 66 bytes .../create_train_dataset.cpython-39.pyc | Bin 0 -> 12540 bytes .../create_train_dataset.py | 408 +++++++++++------- src/utils/__pycache__/__init__.cpython-39.pyc | Bin 172 -> 172 bytes src/utils/__pycache__/helpers.cpython-39.pyc | Bin 8309 -> 12313 bytes src/utils/helpers.py | 141 +++++- 7 files changed, 384 insertions(+), 167 deletions(-) create mode 100644 src/kaggle/working/models/technique_mapping.joblib create mode 100644 src/olive_oil_train_dataset/__pycache__/create_train_dataset.cpython-39.pyc diff --git a/src/README.md b/src/README.md index 33340b1..6040964 100644 --- a/src/README.md +++ b/src/README.md @@ -1 +1 @@ -python -m model_train.create_train_dataset --random-seed 42 --num-simulations 100000 --batch-size 10000 --max-workers 7 \ No newline at end of file +python -m olive_oil_train_dataset.create_train_dataset --random-seed 42 --num-simulations 100000 --batch-size 10000 --max-workers 7 diff --git a/src/kaggle/working/models/technique_mapping.joblib b/src/kaggle/working/models/technique_mapping.joblib new file mode 100644 index 0000000000000000000000000000000000000000..1c8b7ed53a30baf261c27bde4caf2ce35fb6fdb0 GIT binary patch literal 66 zcmZo*nQG1e0ku;!dN?!lN>cNRGs_aEcr*6!6_*yI79j~S_3)GwC8lInW#;E4=A=&X IW-iqO0Is?gZU6uP literal 0 HcmV?d00001 diff --git a/src/olive_oil_train_dataset/__pycache__/create_train_dataset.cpython-39.pyc b/src/olive_oil_train_dataset/__pycache__/create_train_dataset.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ac54765c6092373f72c37638df134e27d6821188 GIT binary patch literal 12540 zcmbVSTWlQHd7j(OUbssxmrL;~a&)n!l|@Q%5+`94%NJR>G?o>~j+=?y?P$+%In?g# z>ddSxPG=LebP^|l5Fr85i*0~>=z|`MzT}}#DGIbe`_iXkUIWx2nxrWb7p>CR{r)qv zyF-e0&@MS=X3qWm^Pm6n{l_Ym3KD+z|L)UG{{>0<3B9C$RQePy?{P(vn8f6kWXj^M zm@4i{OKWG$Oj|c~IWEtdSy7%dbGWOmd^>OEld^(Y5M^WLn79|sqPUmL67E`Syggw~ zw996>J!ww1D`urVWlpuLW)csP+!yVmZ1$#dPvNWf5}VtWWyxOLmdxc~)|Ph+Hoq&I$HdbDo{qmP zxzA{led3B~fpQE!)eRr-tp8n1T1x?AV)H($f+qC zoypgw+L=A#rYkiin!3t64cqgsb~>%+?%0i<-{Dcw^6HIFyW6sTo7Ln<^KY?sO^+rU zE!%QY)atd}x@)!VsNA&udXslv^Ec|<4a>8m*@o3>^jem0*I%V_d+l>v^xz)4lZL8aL+-5Y><#soo-moiX`BsA_Kltf*$l`hym-h!q zd}&ACmHLve?5McQp}eUvc|qzcOzEqBCR9RoLE4u4ny-i03bRtEF>PDzXZ$S7gwmT5 z(?hTl%Z3_0kWP%^=a?F1wiS>r7hHZ83+%Kk!)Y0AuWj>=(eZt!ZMAHJImT-pe$(cj zal`T~zhij47V^2{b-0IeyJdKe=i6<|!Car(!~T`_WJ(MqLGjtEFBy$a&-INpBPbdx z#`AW&!?);d-uR61ymiNTSqxy@*fQFKvRXcxX!lyalaPz3mv3}?_*CdB(z{;IcUtE7 zZHN0kt5uI1ntAKC<+Q9DEjv;j)KlAz8)dpYezxw^bUuMunR-%xYFIz%q29bT4l*X;=LeDqoOLuBFN$MjtBArpg~`FZ#`}9Lw)2{Vct0>Y)~9 zS%Hn+Rr|SQ+#D;SJTKl*YbO_GcjaBVUtp#F7%m9wehJq&u8CM=huS6D9c4cs=CG_;k<;OHGEc03SNRm>=$m8TlGkN6&5pdD!}@0KD*YqjG|PoY z*z5v`U6uMX;Y`wQA6+ParwHoLj!LomxqWZ*;aoUBBe6M*zYxx1X7h9ClT9s1;T-y2 z6nP87ym{mu6?u!pyanVfq20^LVr?J8B8>G4DIP8^bV5?>NDL5oN51WMrP(P7@HVcuyxfhH)e z4SPAm9!o|8eUn~5)ub2Dl3Ikb;bM3+TnY_#_N^&W*th3S`ZDN=1p%+JW)3g@c!8Mp{|CG0;exTgaz^rN%4|0FX)W|k6)o#N#^Te)ozrJCa zCDL|>y|zRtba{Qtv0JQ8?MkGNN3WAqI&3c*Oj3;lS=R05fvhZsgrfJ&G3sITg0{mK zYL&c1g_x&lno%)5DBe=z!Q@O06%}3^+J`lnXF$yRN~ElwkFp8l39BJaj-WIuTwvfn zvRR*ej_7|DiC$HHCUiIH?+^d`d2lS|EZCv6YgR9`IxtqfCs#-PzB`FNzby7L8+gPb_y8Yw79uV%`lZVxP z&V!V&_g8%6u!>Lj{2s~QeO(?}^YbpoK2Y+Z4 zis@QlwBKL^Jt|Dgf)wM8A&{~dKs#vk@~)T$oF6wOz@_v3d@zw+-Fo7aA{^o@V} z(l0OaV_5e-G82i(=Dv>0BPt5PdImPB+*d+{?Cnd^tFw4l`$opoZEH)LT^{ zOyx+fV^d_{RkXcYhMz_^yoMyw$Ww^aoAy?exox$2_5snt&!E;{;qqt>x}wYF_sk3l ziF&OVX%wg}H4NlMd0RNB zU^Y{8Akv}?T_pwx<-1h3j)~uDVl^vy0dm7DyGevXd)Kju{?oU!Y`pNzf7%5 z`^<=_GLSh#U*0o|f;I^fV3shSk=uzWEkZ|xeeq%7klOyq+G*xeXEI=Nmxzf zx^_vbHhdUB=6I5osFl%stXPKciE0^kF(R@#l#+rAWGChc7!nB<0kdH^hh*!fC$Jyj z05H~Z{f*XEdXjU=BnPBPH80b|W1YNiH2}N-VZ+q-=g{Q$kh10Fiih2cEc~95M@~oOe>v6fLXfkTx zf?|JP`>?l+F6-|rivFIW>c3Sq$IueFl$D{f-Nd$h&&ehr=eAU4hG32DH+1B7C=YjJ>HeuI?6 z;73H8z$IXXKO^Kw60D`Z0jC{6l@^loo`c1pz%9za z8P0!8f;+7;?e$!k7Y1Dgdfrq9Fq1T2;d6_n>m ztg=ElhIH)htQarLisYWbfi4X&({V9JeoMIDkJB(YV(cjkC)Yyr*)z>D!w(KwO7`vFs ztskL2@O|_@zj-VyqSYgSg$)1r<_TOUcOD^F>z>L#HhKdB|0wF6x~qVrjh#oShM3>! zgl}t@lW|~9XF}t{=7d>17G?=v5c|Y{mv**S#P$?^@s|N$Vq=r@k3a#e{KI@!pqs#h zTTT$b>=52ElpAR?+qhx*jSa&#JgapZh0P|+y>+K$ABH;&6c4GbgD?T|65L_vt{>3g z2V2Af{xArE)b&Va!O2emZ#X+~c8o+Y%dkDW*>gBdaAvPtaN5_5kJpA1pnk{@*w+JP z2=`&sRl4~V6rVMEo<)PEX94WQZ~?G*U3m2CP6H6iXr$Ous(T16tf8XK#0Z>_ao7z4 z#qhDx(JX10QFBtH34%XdY}@iMLxJK9!2SKr<5?bpD3AIT9~X-VfXFbXZM!k6P&^?1 z?yxhAAu9+-9lp|Nn2(=7KkBU0^SeF2-bHj}jg|+7^DsU1Mf@UwAW*0XSw9>s&O;Fl;i^9DJ$K#D_| z1hc^RV9zHIh$6b;IphU%&vD+t&K3wp zY7Mp#WX}q0)Cd#=sHO_n395rQBjj%|xwW8j&1oV7^ruK8ybaW?K2NbLus8*#VrjT$ zyhum`v+XWoV`V8&S07&wRO9()f-{9@J1+LQ?GDIk(`R@{UaqdLqKl#f41wEq$7S}N zNJdtavw>eA;Bzym?wjOg$F*9{TCkiDJ;luUKq3Mf&v#)kUqNPj-?W42!X<1HgqsrC z-N6T4Gis;!=ZLPKM*;{DqKN0|y+8>?Dn&Sn)24_r5L-=~S16Zu3nwn-U!>%-lw6^N zwveg9jWczxccTp-pNvdSMq;GFf%V`g+Ju)zI@~;+bN> zX2I{!&LV5Jb}TBUb(CPZXySk%kMcvk5RIh-H@9d41@VD}%#6lTA`fQ~5;rRuA;rF9 z)S+lc{v&y(gO_~V8_LZ*up;i0MlrLXl6a^zdQjS~AJ-Rynx(i&OjFMU55@6ub4)Nn z9iRNBXKLLW7Cg6g;7Z^*kO2K+>zFF_Um7~N1Qf)3k345nkL%-0irQkM@l9H`Y@4}w zV76}N)`>yuY(18AbySF*tm9}G2iZSRxcvhvd-sq?)tp>}f1?Q>rzjT`ct3g$-b-06 z%ZqYV)!fL~UYm*fo< zK9NdZ)N%N7@a8b;xO!TpJM!dlg@>3^gM6dGkAP0X6`#fBjl*Bs=jRBu5F&IXJc3M^ z5v3xiDuRX?cs()&gb>cp5Kh|H5$2bfE^-haLQWRJewpR)4DSdbO!7doxN`6@@^OD? zofj<%2yZI=0$KVf9Yg+@$S+c04{r!*iuTBBlPMTlnst2 zxPfJ0_gILNH5v{KCUZ&vTHFGJOQJ-CS@>tDEyX(cCn)(OB{WmMM#&;2gm8;s$zByt zdxYO56ZpwRzDk8;F2p*%AmA7NGG+We5)(n20ooId@5eVd*%8s02>B%EONudcF?dT5 zxcKZzMqntl+(;zzCJpli>SYmFe{y7n`irr3dvI*!$f$JR0IB@+P@L9T8yC2)nH`8I zS`L4S5;5#o=xLjh*D3i162MapaX4eNW;0J~7BlC6LiH|CLJL0tQl^jAjGDt=lstyo zl2Qb;Qo(iv81kN`rS9)*T3JP|$j@m9!4f5-{Dv+KV8<+AlV>|K#7n zI1SQWgCFT7Qe$FM6PA=u$Y9>{~W1nPi(-_9h-!5&@)RNw*=saSTau=W!rJ z#08VwvrI#-#xj7kL5713b^bo;dxI0-b5|_T}p^FF{QqSCs>6P4T>_Y&TBS5K&zOaHB*lRsAh%n zPOrJ)*P$07Dl8YkbAd3)xI7vY;BgFS9$-`c21fZlk&z4?3F-nWNY;VM5h=%IQT~wU zFeD}k6$nci#EC&3oakbzAP(RyDiOqiR}rfZI-HmyPWH;)-=hb?GlB}uI71s7?tvICGiGDT8Mwaqelyrl;h6!1&^3Rni^zQfNkrO)-RcIDD=RPqR}u>f%FJ_izDXxZuf(VM5)=P} zF^*s??m-|LSAu7fcI4xIIHb{QB7|WZ0?|m1Uki#WE5hhQZea5#u+HGrr4&wpvmJDD zLewGJ#*-K53sEXIk5|YZ4^FS1^Ey35PQ2t)+~7MLboqmYs~?OVP_J+{R}#ddM+*K+ zJa~KFx%SJhiH({xGwmFUiG$6C5vQY zf?2c}tD~`K!Y45aA*x^*P;)Br;PL6=jUU*R(ZsNlXztyo9ZIgRq>(kNy{Bh}R_*`j zhzlf2?f>!2{}jDOxjGK7LKN16;urhMY6J|~^7~1|J zRu>#iiVkCPw9LO8pL|_2BF(}%;-HdZV8OVkRJu$ERm`q%k#oFflu*wN>I0msuWn@oHw= zUc0*`q`g@o?>6@TR%5@+i@k7_T9b3a%ao96=eH>NArg4gTORM=$t)3y;jNN%0SGK2 ztHN6sXENgO|H$>9p$>%SPJWQEn}ii1w4cx!LuMkkx?YbV0*!dxEaI>gtau5t>{B-r>h|9FdjiI3la547^HV<_v9}D0iU^qq}84N!rSzj>*A8Xbyj5_{4vP zmaF8xF4dBN!EXy+m-O6Pu8^C^<#JCH5XjR-D36@N3W9#rLq$F*8%WE-F{I6w1OVy& JUxfnI_+NOhsS^MI literal 0 HcmV?d00001 diff --git a/src/olive_oil_train_dataset/create_train_dataset.py b/src/olive_oil_train_dataset/create_train_dataset.py index c265fa9..735973f 100644 --- a/src/olive_oil_train_dataset/create_train_dataset.py +++ b/src/olive_oil_train_dataset/create_train_dataset.py @@ -7,6 +7,10 @@ from tqdm import tqdm import os import argparse import sys +import gc +from utils.helpers import clean_column_name, get_growth_phase, calculate_weather_effect, calculate_water_need, \ + create_technique_mapping, preprocess_weather_data + def get_optimal_workers(): """Calcola il numero ottimale di workers basato sulle risorse del sistema""" @@ -26,150 +30,224 @@ def get_optimal_workers(): return max(1, optimal_workers) -def simulate_single_year(params): +def simulate_zone(base_weather, olive_varieties, year, zone, all_varieties, variety_techniques): """ - Simula un singolo anno di produzione. + Simula la produzione di olive per una singola zona. Args: - params: dict contenente: - - weather_annual: dati meteo annuali - - olive_varieties: informazioni sulle varietà - - sim_id: ID simulazione - - random_seed: seed per riproducibilità + base_weather: DataFrame con dati meteo di base per l'anno selezionato + olive_varieties: DataFrame con le informazioni sulle varietà di olive + zone: ID della zona + all_varieties: Array con tutte le varietà disponibili + variety_techniques: Dict con le tecniche disponibili per ogni varietà + + Returns: + Dict con i risultati della simulazione per la zona """ - np.random.seed(params['random_seed'] + params['sim_id']) + # Crea una copia dei dati meteo per questa zona specifica + zone_weather = base_weather.copy() - # Seleziona anno base e applica variazioni - weather = params['weather_annual'].sample(n=1, random_state=params['random_seed'] + params['sim_id']).iloc[0].copy() + # Genera variazioni meteorologiche specifiche per questa zona + zone_weather['temp_mean'] *= np.random.uniform(0.95, 1.05, len(zone_weather)) + zone_weather['precip_sum'] *= np.random.uniform(0.9, 1.1, len(zone_weather)) + zone_weather['solarenergy_sum'] *= np.random.uniform(0.95, 1.05, len(zone_weather)) - # Applica variazioni meteorologiche (±20%) - for col in weather.index: - if col != 'year': - weather[col] *= np.random.uniform(0.8, 1.2) + # Genera caratteristiche specifiche della zona + num_varieties = np.random.randint(1, 4) # 1-3 varietà per zona + selected_varieties = np.random.choice(all_varieties, size=num_varieties, replace=False) + hectares = np.random.uniform(1, 10) # Dimensione del terreno + percentages = np.random.dirichlet(np.ones(num_varieties)) # Distribuzione delle varietà - # Genera caratteristiche dell'oliveto - num_varieties = np.random.randint(1, 4) - selected_varieties = np.random.choice( - params['olive_varieties']['Varietà di Olive'].unique(), - size=num_varieties, - replace=False - ) - hectares = np.random.uniform(1, 10) - percentages = np.random.dirichlet(np.ones(num_varieties)) + # Inizializzazione contatori annuali + annual_production = 0 + annual_min_oil = 0 + annual_max_oil = 0 + annual_avg_oil = 0 + annual_water_need = 0 - annual_results = { - 'simulation_id': params['sim_id'], - 'year': weather['year'], - 'hectares': hectares, - 'num_varieties': num_varieties, - 'total_olive_production': 0, - 'total_oil_production': 0, - 'total_water_need': 0 - } + # Inizializzazione dizionario dati varietà + variety_data = {clean_column_name(variety): { + 'tech': '', + 'pct': 0, + 'prod_t_ha': 0, + 'oil_prod_t_ha': 0, + 'oil_prod_l_ha': 0, + 'min_yield_pct': 0, + 'max_yield_pct': 0, + 'min_oil_prod_l_ha': 0, + 'max_oil_prod_l_ha': 0, + 'avg_oil_prod_l_ha': 0, + 'l_per_t': 0, + 'min_l_per_t': 0, + 'max_l_per_t': 0, + 'avg_l_per_t': 0, + 'olive_prod': 0, + 'min_oil_prod': 0, + 'max_oil_prod': 0, + 'avg_oil_prod': 0, + 'water_need': 0 + } for variety in all_varieties} - # Aggiungi dati meteorologici - for col in weather.index: - if col != 'year': - annual_results[f'weather_{col}'] = weather[col] - - variety_details = [] + # Simula produzione per ogni varietà selezionata for i, variety in enumerate(selected_varieties): - variety_data = params['olive_varieties'][ - params['olive_varieties']['Varietà di Olive'] == variety - ] - technique = np.random.choice(variety_data['Tecnica di Coltivazione'].unique()) + # Seleziona tecnica di coltivazione casuale per questa varietà + technique = np.random.choice(variety_techniques[variety]) percentage = percentages[i] - variety_info = variety_data[ - variety_data['Tecnica di Coltivazione'] == technique + # Ottieni informazioni specifiche della varietà + variety_info = olive_varieties[ + (olive_varieties['Varietà di Olive'] == variety) & + (olive_varieties['Tecnica di Coltivazione'] == technique) ].iloc[0] - # Calcoli produzione con variabilità - production_data = calculate_production( - variety_info, weather, percentage, hectares, - params['sim_id'] + i + # Calcola produzione base con variabilità + base_production = variety_info['Produzione (tonnellate/ettaro)'] * 1000 * percentage * hectares / 12 + base_production *= np.random.uniform(0.9, 1.1) + + # Calcola effetti meteo sulla produzione + weather_effect = zone_weather.apply( + lambda row: calculate_weather_effect(row, variety_info['Temperatura Ottimale']), + axis=1 ) + monthly_production = base_production * (1 + weather_effect / 10000) + monthly_production *= np.random.uniform(0.95, 1.05, len(zone_weather)) - variety_details.append(production_data) + # Calcola produzione annuale per questa varietà + annual_variety_production = monthly_production.sum() - # Aggiorna totali - annual_results['total_olive_production'] += production_data['production'] - annual_results['total_oil_production'] += production_data['oil_production'] - annual_results['total_water_need'] += production_data['water_need'] + # Calcola rese di olio con variabilità + min_yield_factor = np.random.uniform(0.95, 1.05) + max_yield_factor = np.random.uniform(0.95, 1.05) + avg_yield_factor = (min_yield_factor + max_yield_factor) / 2 - # Aggiungi dettagli varietà - for i, detail in enumerate(variety_details): - prefix = f'variety_{i + 1}' - for key, value in detail.items(): - annual_results[f'{prefix}_{key}'] = value + min_oil_production = annual_variety_production * variety_info[ + 'Min Litri per Tonnellata'] / 1000 * min_yield_factor + max_oil_production = annual_variety_production * variety_info[ + 'Max Litri per Tonnellata'] / 1000 * max_yield_factor + avg_oil_production = annual_variety_production * variety_info[ + 'Media Litri per Tonnellata'] / 1000 * avg_yield_factor - # Calcola metriche per ettaro e KPI - annual_results['olive_production_ha'] = annual_results['total_olive_production'] / hectares - annual_results['oil_production_ha'] = annual_results['total_oil_production'] / hectares - annual_results['water_need_ha'] = annual_results['total_water_need'] / hectares + # Calcola fabbisogno idrico + base_water_need = ( + variety_info['Fabbisogno Acqua Primavera (m³/ettaro)'] + + variety_info['Fabbisogno Acqua Estate (m³/ettaro)'] + + variety_info['Fabbisogno Acqua Autunno (m³/ettaro)'] + + variety_info['Fabbisogno Acqua Inverno (m³/ettaro)'] + ) / 4 - # Calcola efficienze - if annual_results['total_olive_production'] > 0: - annual_results['yield_efficiency'] = annual_results['total_oil_production'] / annual_results[ - 'total_olive_production'] - else: - annual_results['yield_efficiency'] = 0 + monthly_water_need = zone_weather.apply( + lambda row: calculate_water_need(row, base_water_need, variety_info['Temperatura Ottimale']), + axis=1 + ) + monthly_water_need *= np.random.uniform(0.95, 1.05, len(monthly_water_need)) + annual_variety_water_need = monthly_water_need.sum() * percentage * hectares - if annual_results['total_water_need'] > 0: - annual_results['water_efficiency'] = annual_results['total_olive_production'] / annual_results[ - 'total_water_need'] - else: - annual_results['water_efficiency'] = 0 + # Aggiorna totali annuali + annual_production += annual_variety_production + annual_min_oil += min_oil_production + annual_max_oil += max_oil_production + annual_avg_oil += avg_oil_production + annual_water_need += annual_variety_water_need - return annual_results + # Aggiorna dati varietà + clean_variety = clean_column_name(variety) + variety_data[clean_variety].update({ + 'tech': clean_column_name(technique), + 'pct': percentage, + 'prod_t_ha': variety_info['Produzione (tonnellate/ettaro)'] * np.random.uniform(0.95, 1.05), + 'oil_prod_t_ha': variety_info['Produzione Olio (tonnellate/ettaro)'] * np.random.uniform(0.95, 1.05), + 'oil_prod_l_ha': variety_info['Produzione Olio (litri/ettaro)'] * np.random.uniform(0.95, 1.05), + 'min_yield_pct': variety_info['Min % Resa'] * min_yield_factor, + 'max_yield_pct': variety_info['Max % Resa'] * max_yield_factor, + 'min_oil_prod_l_ha': variety_info['Min Produzione Olio (litri/ettaro)'] * min_yield_factor, + 'max_oil_prod_l_ha': variety_info['Max Produzione Olio (litri/ettaro)'] * max_yield_factor, + 'avg_oil_prod_l_ha': variety_info['Media Produzione Olio (litri/ettaro)'] * avg_yield_factor, + 'l_per_t': variety_info['Litri per Tonnellata'] * np.random.uniform(0.98, 1.02), + 'min_l_per_t': variety_info['Min Litri per Tonnellata'] * min_yield_factor, + 'max_l_per_t': variety_info['Max Litri per Tonnellata'] * max_yield_factor, + 'avg_l_per_t': variety_info['Media Litri per Tonnellata'] * avg_yield_factor, + 'olive_prod': annual_variety_production, + 'min_oil_prod': min_oil_production, + 'max_oil_prod': max_oil_production, + 'avg_oil_prod': avg_oil_production, + 'water_need': annual_variety_water_need + }) + + # Appiattisci i dati delle varietà + flattened_variety_data = { + f'{variety}_{key}': value + for variety, data in variety_data.items() + for key, value in data.items() + } + + # Restituisci il risultato della zona + return { + 'year': year, + 'zone_id': zone + 1, + 'temp_mean': zone_weather['temp_mean'].mean(), + 'precip_sum': zone_weather['precip_sum'].sum(), + 'solar_energy_sum': zone_weather['solarenergy_sum'].sum(), + 'ha': hectares, + 'zone': f"zone_{zone + 1}", + 'olive_prod': annual_production, + 'min_oil_prod': annual_min_oil, + 'max_oil_prod': annual_max_oil, + 'avg_oil_prod': annual_avg_oil, + 'total_water_need': annual_water_need, + **flattened_variety_data + } -def generate_training_dataset_parallel(weather_data, olive_varieties, num_simulations=1000, - random_seed=42, max_workers=None, batch_size=500, - output_path='olive_training_dataset.parquet'): +def simulate_olive_production_parallel(weather_data, olive_varieties, num_simulations=5, num_zones=None, + random_seed=None, + max_workers=None, batch_size=500, + output_path='olive_simulation_dataset.parquet'): """ - Genera dataset di training utilizzando multiprocessing. + Versione corretta della simulazione parallelizzata con gestione batch e salvataggio file Args: - weather_data: DataFrame dati meteo - olive_varieties: DataFrame varietà olive - num_simulations: numero di simulazioni - random_seed: seed per riproducibilità - max_workers: numero massimo di workers - batch_size: dimensione batch - output_path: percorso file output + weather_data: DataFrame con dati meteo + olive_varieties: DataFrame con varietà di olive + num_simulations: numero di simulazioni da eseguire (default: 5) + num_zones: numero di zone per simulazione (default: None, usa num_simulations se non specificato) + random_seed: seed per riproducibilità (default: None) + max_workers: numero massimo di workers (default: None, usa get_optimal_workers) + batch_size: dimensione del batch per gestione memoria (default: 500) + output_path: percorso del file di output (default: 'olive_simulation_dataset.parquet') + + Returns: + DataFrame con i risultati delle simulazioni """ - np.random.seed(random_seed) + if random_seed is not None: + np.random.seed(random_seed) - # Prepara dati meteo annuali - weather_annual = weather_data.groupby('year').agg({ - 'temp': ['mean', 'min', 'max', 'std'], - 'humidity': ['mean', 'min', 'max'], - 'precip': ['sum', 'mean', 'std'], - 'solarradiation': ['mean', 'sum', 'std'], - 'cloudcover': ['mean'] - }).reset_index() + # Se num_zones non è specificato, usa num_simulations + if num_zones is None: + num_zones = num_simulations - weather_annual.columns = ['year'] + [ - f'{col[0]}_{col[1]}' for col in weather_annual.columns[1:] - ] + # Preparazione dati + create_technique_mapping(olive_varieties) + monthly_weather = preprocess_weather_data(weather_data) + all_varieties = olive_varieties['Varietà di Olive'].unique() + variety_techniques = { + variety: olive_varieties[olive_varieties['Varietà di Olive'] == variety]['Tecnica di Coltivazione'].unique() + for variety in all_varieties + } - # Calcola workers ottimali + # Calcolo workers ottimali usando get_optimal_workers if max_workers is None: max_workers = get_optimal_workers() + print(f"Utilizzando {max_workers} workers ottimali basati sulle risorse del sistema") - print(f"Utilizzando {max_workers} workers") - - # Calcola numero di batch + # Calcolo numero di batch num_batches = (num_simulations + batch_size - 1) // batch_size - print(f"Elaborazione di {num_simulations} simulazioni in {num_batches} batch") - - # Crea directory output se necessario - os.makedirs(os.path.dirname(output_path) if os.path.dirname(output_path) else '.', exist_ok=True) + print(f"Elaborazione di {num_simulations} simulazioni con {num_zones} zone in {num_batches} batch") + print(f"Totale record attesi: {num_simulations * num_zones:,}") # Lista per contenere tutti i DataFrame dei batch all_batches = [] + # Elaborazione per batch for batch_num in range(num_batches): start_sim = batch_num * batch_size end_sim = min((batch_num + 1) * batch_size, num_simulations) @@ -177,54 +255,79 @@ def generate_training_dataset_parallel(weather_data, olive_varieties, num_simula batch_results = [] - # Preparazione parametri per ogni simulazione - simulation_params = [ - { - 'weather_annual': weather_annual, - 'olive_varieties': olive_varieties, - 'sim_id': sim_id, - 'random_seed': random_seed - } - for sim_id in range(start_sim, end_sim) - ] - - # Esegui simulazioni in parallelo + # Parallelizzazione usando ProcessPoolExecutor per il batch corrente with ProcessPoolExecutor(max_workers=max_workers) as executor: - futures = [executor.submit(simulate_single_year, params) - for params in simulation_params] + # Calcola il numero totale di task per questo batch + # Ogni simulazione nel batch corrente genererà num_zones zone + total_tasks = current_batch_size * num_zones - with tqdm(total=current_batch_size, + with tqdm(total=total_tasks, desc=f"Batch {batch_num + 1}/{num_batches}") as pbar: - for future in as_completed(futures): + # Dizionario per tenere traccia delle futures e dei loro sim_id + future_to_sim_id = {} + + # Sottometti i lavori per tutte le simulazioni e zone nel batch corrente + for sim in range(start_sim, end_sim): + selected_year = np.random.choice(monthly_weather['year'].unique()) + base_weather = monthly_weather[monthly_weather['year'] == selected_year].copy() + base_weather.loc[:, 'growth_phase'] = base_weather['month'].apply(get_growth_phase) + + # Sottometti i lavori per tutte le zone di questa simulazione + for zone in range(num_zones): + future = executor.submit( + simulate_zone, + base_weather=base_weather, + olive_varieties=olive_varieties, + year=selected_year, + zone=zone, + all_varieties=all_varieties, + variety_techniques=variety_techniques + ) + future_to_sim_id[future] = (sim + 1, zone + 1) + + # Raccogli i risultati man mano che vengono completati + for future in as_completed(future_to_sim_id.keys()): + sim_id, zone_id = future_to_sim_id[future] try: result = future.result() + result['simulation_id'] = sim_id + result['zone_id'] = zone_id batch_results.append(result) pbar.update(1) except Exception as e: - print(f"Errore in simulazione: {str(e)}") + print(f"Errore nella simulazione {sim_id}, zona {zone_id}: {str(e)}") continue - # Converti risultati in DataFrame + # Converti batch_results in DataFrame e aggiungi alla lista dei batch batch_df = pd.DataFrame(batch_results) all_batches.append(batch_df) + # Stampa statistiche del batch + print(f"\nStatistiche Batch {batch_num + 1}:") + print(f"Righe processate: {len(batch_df):,}") + print(f"Memoria utilizzata: {batch_df.memory_usage(deep=True).sum() / 1024 ** 2:.2f} MB") + # Libera memoria del batch_results + del batch_df + gc.collect() # Forza garbage collection # Concatena tutti i batch e salva + print("\nConcatenazione dei batch e salvataggio...") final_df = pd.concat(all_batches, ignore_index=True) + + # Crea directory output se necessario + os.makedirs(os.path.dirname(output_path) if os.path.dirname(output_path) else '.', exist_ok=True) + + # Salva il dataset final_df.to_parquet(output_path) + # Stampa statistiche finali + print("\nStatistiche Finali:") + print(f"Totale simulazioni completate: {len(final_df):,}") + print(f"Memoria totale utilizzata: {final_df.memory_usage(deep=True).sum() / 1024 ** 2:.2f} MB") print(f"\nDataset salvato in: {output_path}") - # Statistiche finali - print("\nStatistiche finali:") - print(f"Righe totali: {len(final_df)}") - print("\nAnalisi variabilità:") - for col in ['olive_production_ha', 'oil_production_ha', 'water_need_ha']: - cv = final_df[col].std() / final_df[col].mean() - print(f"{col}: CV = {cv:.2%}") - return final_df @@ -319,7 +422,6 @@ def calculate_solar_effect(radiation): return base_factor * np.random.uniform(0.8, 1.2) - def parse_arguments(): """ Configura e gestisce i parametri da riga di comando @@ -339,10 +441,17 @@ def parse_arguments(): parser.add_argument( '--num-simulations', type=int, - default=1000000, + default=100000, help='Numero totale di simulazioni da eseguire' ) + parser.add_argument( + '--num-zones', + type=int, + default=None, + help='Numero di zone per simulazione (default: uguale a num-simulations)' + ) + parser.add_argument( '--batch-size', type=int, @@ -360,25 +469,21 @@ def parse_arguments(): parser.add_argument( '--max-workers', type=int, - default=2, - help='Quantità di workers' + default=None, + help='Quantità di workers (default: usa get_optimal_workers)' ) return parser.parse_args() -# Esempio di utilizzo if __name__ == "__main__": print("Generazione dataset di training...") - - # Parsing argomenti args = parse_arguments() # Carica dati try: - # Carica dati weather_data = pd.read_parquet('./sources/weather_data_complete.parquet') olive_varieties = pd.read_parquet('./sources/olive_varieties.parquet') except Exception as e: @@ -389,36 +494,23 @@ if __name__ == "__main__": print("\nConfigurazione:") print(f"Random seed: {args.random_seed}") print(f"Numero simulazioni: {args.num_simulations:,}") - print(f"Workers: {args.max_workers:,}") + print(f"Numero zone per simulazione: {args.num_zones if args.num_zones is not None else args.num_simulations:,}") + print(f"Workers: {args.max_workers if args.max_workers is not None else 'auto'}") print(f"Dimensione batch: {args.batch_size:,}") print(f"File output: {args.output_path}") # Genera dataset try: - df = generate_training_dataset_parallel( + df = simulate_olive_production_parallel( weather_data=weather_data, olive_varieties=olive_varieties, - random_seed=args.random_seed, num_simulations=args.num_simulations, + num_zones=args.num_zones, + random_seed=args.random_seed, batch_size=args.batch_size, output_path=args.output_path, max_workers=args.max_workers ) except Exception as e: print(f"Errore durante la generazione del dataset: {str(e)}") - sys.exit(1) - - print("\nShape dataset:", df.shape) - print("\nColonne disponibili:") - print(df.columns.tolist()) - - print("\nStatistiche di base:") - print(df.describe()) - - # Analisi variabilità - print("\nAnalisi coefficienti di variazione:") - for col in ['olive_production_ha', 'oil_production_ha', 'water_need_ha']: - cv = df[col].std() / df[col].mean() - print(f"{col}: {cv:.2%}") - - print("\nDataset salvato './sources/olive_training_dataset.parquet'") \ No newline at end of file + sys.exit(1) \ No newline at end of file diff --git a/src/utils/__pycache__/__init__.cpython-39.pyc b/src/utils/__pycache__/__init__.cpython-39.pyc index 7c5c2b8a07c546a4fbd8234e8d60888cba1c295b..f493464e1ffc2da8c8732540ab5fc480f8982bd8 100644 GIT binary patch delta 18 YcmZ3(xQ3A{k(ZZ?0SIyzP2^ex03{>@HUIzs delta 18 YcmZ3(xQ3A{k(ZZ?0SM|CCUPwT03z`Nr2qf` diff --git a/src/utils/__pycache__/helpers.cpython-39.pyc b/src/utils/__pycache__/helpers.cpython-39.pyc index 1e7d28da3d54cb5e3ea09ee2abc8dd63d57796b6..597d4c031b4fc15c8bc0690b75056681a1c4c3f9 100644 GIT binary patch delta 5745 zcma)AYit}>6`nh@J3IUEdhKld%FAiuB-p>ESO6{R&*l0w;1ZN_`Y>v3iu zxijm;X5CeZDbP|uv_hh7BH1lM7*tU(lomlNwd!A}s03ofkNyFX{s9DtDgwfH?yPrh zCsi}6J#!!D-nsXj?>z4P*FDdlOqv}XF$GWj^}7o8;q%F@?4I-c_tcpa&gxv@suL+1 zS)&-uMwx;-b7I+;e8;nKE5Wr3jE8u5A(ZW~l2(%I7gQeM2I`$wXgXyXRm~en^SqNwQ_*TAcLCN;>?R*E`1ALJ0#Cr=L=DYar1(w~) zNB9umgVHv>bCmDp`%v7@Z{it#BT75?etrN0201&fWNuy=WhWW)8|T#?(C@2*bzd)9 z#j=>qc6w#cbaFv0SD5fOnKP#Cm=lh*rZL^@uUH~?rYuh31=h_)^K7%~<|byF{C;P- z;@L$^8gRPHis*sTEzf{lq;D1Cshh_Ul1t!>S8-B(TLJ_-&u38ExJYD`VA5Zj|BRB=06 zGO8c)tg~L@J*{VD!T2Rh4xsfiJ1Dl`Ew(nsV|_#WQS`%Z#j!nKbF7lMks4xovtmQ| z_MDXoi8~t4#`?E!SWJgk&gHQq2h_DHjkjaF*u5)%j&0N8J7|i~T)AZVjb!q~Z8S2& z#0Xx?s;>@@`jHt^*p@fDQDCt{j_rE+aV+L`Q z$ODZ%ou6hmH7cFM`eACgukqc^A^kQgKGgV4=MB4VrQ*gl7c4JF>dJA`GcPvMsk@lj zcqDa@om_b`wKrstfR;Cl?JKXOKgEVf6ym{a%o97r?R>Dze>OUMN-VqbWY3b?VSzW3 z7TamaWaA6{qc@7vR2D>BA|8<{kr^Uqh@2%d+jzQvjt%VE_Ykpktj^(-&sf5bOj@Q_71mnGHZBefu|GE68F<|II)-FaadKtPmOAU8 zm1H{Bh@-wn95r+bvVg2ERh$DtZlM7Yx%g~H^GnifT=7-njI~zi3R<1`8j+`oY|wO< z)O4`cV1@@E(T2ILk3F+8w{3;7ry9dMdh6eyvCo0{QQD|9>5X&snViFhOP(k@j>U5= z#z{=+hA>!3rIqyU5XGff<95T zy_{=#Swq+rs{}5&%Q}R93>+J`oFefRMEQ(2m8(p_zGrEx($6phKBoCs#S0jmr%kRt zQ55LIa??6SafYp`{Ys7XVauD8v@);q5Zb~BXmx$Hkw>;D^BOnMfpC>pcno!7i-Rq3 zN)?~u9rIzHoY%QAA1U3_?CC^Lg{RQdg`RHo>}mFFLeDd34Wm`(Y4jve3S$6 zZjsMF#sbH4Gj#?{?~h4*0VpXnk~ab@q|fr>o>i>mtjS3$?`7k{v`ed%m|Ja^C6}8w z9V;7?OhGvco_a1Qcf#dQ*d-WOFxy5v_F;6s>cHLQ&cLXr5aa{KbA-z~u`oqVvp$y6 zC<2C<&NjSNY7K>Sre9H2{eS8eHIiUkSv|?Nt6~wu^AKw5AzLHMk{}*Mfitc`;TmA9 z_5qy$iiWl(jMGpxu*wKZ1S2S0AD33Us@*ZE@xp(W|Nh&DCmtCKVi;UjUCD&~#OjtP zC}pD)Ft=REvbYp-HW2cpnfJ<~DdeECQAX)CGNQO3=lB81hr9vF2TtlGlrE1BkDM|K z1;-kxQyv6oFj6dY%W+3q{xw%LD;0DMPnRbgdt#l5lVn=b8Ak<@Vg1UqvdET{MRkb` zT;sL7h^RIO(@qiDZIJf6IQ-t z=gmQG58hXHJbT8RL$+nDbFic`Qq^(%T?*YcmhwdR7u9(-uh!HW<4{yXJq-^F_{y+K zVFJGMGkE#T7Im_R4T2PNX!{xDCTMSS2l4`lvt*2t!LcQpLrxhYckdo*WI%4tUCAAvg;c~^OVJHfn>4*bo&vMWk}Wo z@%tfOEmn|;5g`3UQ#U!{%_dExGcCQjepjo!y3h^#^ZVL6st56YmGf0Xi#*1t-CB>< z!;ne-7@u6H{&mJNg)+lLs6*_g4Ruf+7d66zHW3c!kh_6c| zL8V7+#~LqtM`^qF3sDy073vQ;Ws{4Wi2r*;Wa_Q~9lkzkJ5I^;b<_1`aWbOFoe2wa zW#UIfUIpG!)QCM*6A(ugmk}jhM{x|R$$+pi3{6*VlmRG&SDWA=>n?)~Xk*?zXwxYT9 zaf)8!BLsOWsq8$%_gfF)@jBrsR=_KMO5`;n>yuvzzu#Zis_Jd{Wuk55iJ#M~uM;7l zi8qK)1{;9w7bs=)jcDq`nHXnioXrCZ6~CeZ*Dw#it0kv;^T6T$no!k^0SY2_QjM{0 zW=LeERq+m5*CC4(*g_Va&#r;2>jPF@BCDn?Mi9e7OUTP9a(@zPJ{dIPEv#yzS}3mk zsil_vB!oKLf{QSV8;zFRZaIf*VkTo;hnjp1`FjR;N3BV5XmP4m+xErVa)E@uV3`DG zzqhHFjZ6A~v_nUcHQG_dyQr@-E<$h|e~MBr9ZzZs94S(#0y%Y7KmzrW4Ewn1!etkJ zb=dR;1D%vLuz-~tJPJE!(;Du&l&>J2CVNMU&T>c+prQDkkYdF*rjR*v+ne?Eim>u_B^y0sm+IWDAk)Hqhf}Wdd>H`tlQQ2HCJ#=7 z+4!3LI~%Fa*d=bAy=vd&?W9w=0R26*{E8^&E!S=3!jw3!flWK4kUyczB^(y2m5EtjL)z!Zg@uz~j`mSMD=j*s zW@D`+bkgrE;;=jA%&yMA0my#u$1LGPf=M4fsi;Y$;Og4nM~V0iOuR?yVe){2*URL8 z<@>7FPC;pipwp|^2Li2`C2qRnajJxoMpb6TQ?dgAj*hbgK0)q1un+0CC`<`jP!J#K z0?Lmbfv}DWTzKG;4Br>`Qol)EVH}}N&yV0kLYT9HZZpI)MCfcU&${xgAhT$h7|Ez7 z9h9^NxxY2il73lQmT?FfcGa=&hKPa#i-AM5aVrtz<)B(}C^gixC3O&|>nn*doKzsb Z*nbnJ`gWu@88Kt8p~sT3RIED|`#*7WemVdE delta 1752 zcmbW1O>7%Q6vsX5^?LoiN&FEf4J8DJIOGc?5K2R+qNoTV6-0|tHo#`>87G^rcg^fN zt%Hz34IuRZlsO@yN}Nz`Nalo4FK|JexS^Ghkl+9Zj!38yD(|h`HVT~Z%D>t7_W$0@ zdo#PAZ2fsQYi2UZ2>iZTTG+n1cq4m?L~cAf-IeTEQ_+=XT#u6ojZk8%dV(cs^pH@Q z#tvjX#nLQIl|zZfsS0+6$-7ymuyi>>6W~qK6nJx?Dh+mqX2I4%6)NFRbB9r?(IM~; z-JJtQn0xE~O`+awg}P14^k2MoAh88ie*igcl4a^`lfC4En~Z;kC=O=Wt%md9dqUf z&s&TeS6qG>29Oq-=GB4e8MV$pujkyBZ#5xlU+Eo1cn*4g`^CkVJ;uGo`i>>uk8YBL z_$4|?hI%pikSG~Y@T~Ye_U76!-0B&#R^x2XGJVT+yg+WWZ99d9kyXNHU^tQjX<;dd@0r|Ue&=Mvk}J05`E|G1TK!V|n0+dt#|)ptHCIs<@pfJ0w1=h^@b=`Ix0!F?b{W+4%}>Qe z?F_jjKGNpNM(>vPh^%6Rjtacg`y>A}sbDMk+JEFpu28GqU*=!pg`z`(-mT(i(($?NLtaA|h%JPPs3GbIieQKaVq4hdCGlN(?upmY#52JO;P*T5t0sczzSRo**WaF4 zi(8H#DD4*1!T6k*9UD9U3I^$jO~eL3PY?PW)_dQwoQBIMKoHy$;Nc7s*lWjnqs;>G z@z~VNI_AjS&0)J&7!TqNX8LW;j`vplG*%%$i|F{K`VNFtCBD&XjCV;FS5nRVpN2pn z-$7UiAAzrnu7u~ofa5d=W;NcgGc#CJ_>OtJ3y*@N&z1+(cpNL?ZzJA9oH!Rl;koG7 z)lc!fxH&OSuJyi}=n>Kp&goM3T?~8=5G1f8;oE;Q(Wpr&yzY+AUE5~VxF_6&-SlJ# zj%XU;*I|ST4sa-O{OofTF$oBimg!K_<1dAV-)ULSHkTq$6OW=R=Cqrw4)0(<8Zm_4 zsOR(WganCoINRqq>}U8D;LE7Fgqm2xc1=HsLwe554#&OX*ATd%W3`0s3BO?KO4Frn so2`<^LE_B int: @@ -215,12 +217,6 @@ def get_full_data(simulated_data: pd.DataFrame, return full_data - - - -import numpy as np -from typing import List, Dict - def prepare_static_features_multiple(varieties_info: List[Dict], percentages: List[float], hectares: float, @@ -376,4 +372,133 @@ def add_controlled_variation(base_value: float, max_variation_pct: float = 0.20) Valore con variazione applicata """ variation = np.random.uniform(-max_variation_pct, max_variation_pct) - return base_value * (1 + variation) \ No newline at end of file + return base_value * (1 + variation) + +def get_growth_phase(month): + if month in [12, 1, 2]: + return 'dormancy' + elif month in [3, 4, 5]: + return 'flowering' + elif month in [6, 7, 8]: + return 'fruit_set' + else: + return 'ripening' + +def calculate_weather_effect(row, optimal_temp): + # Effetti base + temp_effect = -0.1 * (row['temp_mean'] - optimal_temp) ** 2 + rain_effect = -0.05 * (row['precip_sum'] - 600) ** 2 / 10000 + sun_effect = 0.1 * row['solarenergy_sum'] / 1000 + + # Fattori di scala basati sulla fase di crescita + if row['growth_phase'] == 'dormancy': + temp_scale = 0.5 + rain_scale = 0.2 + sun_scale = 0.1 + elif row['growth_phase'] == 'flowering': + temp_scale = 2.0 + rain_scale = 1.5 + sun_scale = 1.0 + elif row['growth_phase'] == 'fruit_set': + temp_scale = 1.5 + rain_scale = 1.0 + sun_scale = 0.8 + else: # ripening + temp_scale = 1.0 + rain_scale = 0.5 + sun_scale = 1.2 + + # Calcolo dell'effetto combinato + combined_effect = ( + temp_scale * temp_effect + + rain_scale * rain_effect + + sun_scale * sun_effect + ) + + # Aggiustamenti specifici per fase + if row['growth_phase'] == 'flowering': + combined_effect -= 0.5 * max(0, row['precip_sum'] - 50) # Penalità per pioggia eccessiva durante la fioritura + elif row['growth_phase'] == 'fruit_set': + combined_effect += 0.3 * max(0, row['temp_mean'] - (optimal_temp + 5)) # Bonus per temperature più alte durante la formazione dei frutti + + return combined_effect + +def calculate_water_need(weather_data, base_need, optimal_temp): + # Calcola il fabbisogno idrico basato su temperatura e precipitazioni + temp_factor = 1 + 0.05 * (weather_data['temp_mean'] - optimal_temp) # Aumenta del 5% per ogni grado sopra l'ottimale + rain_factor = 1 - 0.001 * weather_data['precip_sum'] # Diminuisce leggermente con l'aumentare delle precipitazioni + return base_need * temp_factor * rain_factor + +def create_technique_mapping(olive_varieties, mapping_path='./kaggle/working/models/technique_mapping.joblib'): + # Estrai tutte le tecniche uniche dal dataset e convertile in lowercase + all_techniques = olive_varieties['Tecnica di Coltivazione'].str.lower().unique() + + # Crea il mapping partendo da 1 + technique_mapping = {tech: i + 1 for i, tech in enumerate(sorted(all_techniques))} + + # Salva il mapping + os.makedirs(os.path.dirname(mapping_path), exist_ok=True) + joblib.dump(technique_mapping, mapping_path) + + return technique_mapping + + +def encode_techniques(df, mapping_path='./kaggle/working/models/technique_mapping.joblib'): + if not os.path.exists(mapping_path): + raise FileNotFoundError(f"Mapping not found at {mapping_path}. Run create_technique_mapping first.") + + technique_mapping = joblib.load(mapping_path) + + # Trova tutte le colonne delle tecniche + tech_columns = [col for col in df.columns if col.endswith('_tech')] + + # Applica il mapping a tutte le colonne delle tecniche + for col in tech_columns: + df[col] = df[col].str.lower().map(technique_mapping).fillna(0).astype(int) + + return df + + +def decode_techniques(df, mapping_path='./kaggle/working/models/technique_mapping.joblib'): + if not os.path.exists(mapping_path): + raise FileNotFoundError(f"Mapping not found at {mapping_path}") + + technique_mapping = joblib.load(mapping_path) + reverse_mapping = {v: k for k, v in technique_mapping.items()} + reverse_mapping[0] = '' # Aggiungi un mapping per 0 a stringa vuota + + # Trova tutte le colonne delle tecniche + tech_columns = [col for col in df.columns if col.endswith('_tech')] + + # Applica il reverse mapping a tutte le colonne delle tecniche + for col in tech_columns: + df[col] = df[col].map(reverse_mapping) + + return df + + +def decode_single_technique(technique_value, mapping_path='./kaggle/working/models/technique_mapping.joblib'): + if not os.path.exists(mapping_path): + raise FileNotFoundError(f"Mapping not found at {mapping_path}") + + technique_mapping = joblib.load(mapping_path) + reverse_mapping = {v: k for k, v in technique_mapping.items()} + reverse_mapping[0] = '' + + return reverse_mapping.get(technique_value, '') + +def preprocess_weather_data(weather_df): + # Calcola statistiche mensili per ogni anno + monthly_weather = weather_df.groupby(['year', 'month']).agg({ + 'temp': ['mean', 'min', 'max'], + 'humidity': 'mean', + 'precip': 'sum', + 'windspeed': 'mean', + 'cloudcover': 'mean', + 'solarradiation': 'sum', + 'solarenergy': 'sum', + 'uvindex': 'max' + }).reset_index() + + monthly_weather.columns = ['year', 'month'] + [f'{col[0]}_{col[1]}' for col in monthly_weather.columns[2:]] + return monthly_weather \ No newline at end of file