# DAG: data_cleaning_MLP
# schedule: 0 14 * * *

# Airflow dependencies
from asyncio import Task
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago
from airflow.models import Variable

from jinja2 import Template
from functools import reduce
from datetime import datetime, timedelta
from standardized_process.dags.functions.data_cleaning import *


import json 
import os 

# Module-level flag; it is assigned here and never read in this file.
task_loaded = False

# Arguments passed to the DAG constructor as ``default_args``.
default_args = {
    'owner': 'pedro',
    'start_date': datetime(2023, 4, 22),  # midnight: h/m/s default to 0
}

# Date components substituted into the configuration template. They are
# pinned to fixed strings; the dynamic alternative would be
# datetime.now().year / .month / .day.
year = "2024"
month = "4"
day = "3"

# Site and data source substituted into the configuration template. Also
# pinned; the environment-driven alternative would be
# os.getenv('MINE_SITE') / os.getenv('DRILL_SOURCE').
mine_site = "Los Pelambres"
drilling_source = "Surface Manager"

# Read the JSON configuration as raw text so its Jinja placeholders can be
# rendered before the JSON is parsed. The encoding is given explicitly
# because open()'s default depends on the host locale, which would make
# decoding of any non-ASCII text in the config machine-dependent.
# NOTE(review): the path is relative, so it resolves against the process
# working directory — confirm that matches where this module is parsed.
with open(
    'dags/standardized_process/dags/configuration/data_cleaning_config.json',
    encoding='utf-8',
) as f:
    template_content = f.read()

# Build a Jinja2 template from the file contents, fill in the site, source
# and date values, then parse the rendered text as JSON.
template = Template(template_content)
rendered_config = template.render(
    mine_site=mine_site,
    drilling_source=drilling_source,
    year=year,
    month=month,
    day=day,
)
data = json.loads(rendered_config)

# Lookup table from the "function" name given in each configured task to the
# callable that is handed to its operator.
function_mapping = {
    'normalize_headers': normalize_headers,
    'drill_groups_label': drill_groups_label,
    'prepare_columns': prepare_columns,
    'coordinates_limit_filter': coordinates_limit_filter,
    'minesite_limit_filter': minesite_limit_filter,
    'unit_converter': unit_converter,
    'zero_samples_filter': zero_samples_filter,
    'iqr_limit_filter': iqr_limit_filter,
    'z_score_filter': z_score_filter,
    'add_id': add_id,
    'register_per_meter': register_per_meter,
    'get_bench_phase_values': get_bench_phase_values,
}

def chain_tasks(x, y):
    """Apply the ``<<`` operator to ``x`` and ``y`` and return the result.

    Nothing in this module calls it. NOTE(review): it looks like a binary
    step meant for ``functools.reduce`` over the task list — confirm before
    relying on or removing it.

    Args:
        x: Left-hand operand of ``<<``.
        y: Right-hand operand of ``<<``.

    Returns:
        Whatever ``x << y`` evaluates to.
    """
    shifted = x << y
    return shifted

# DAG definition: cron schedule '0 14 * * *', at most one active run, no
# catch-up of missed intervals, and paused when first created.
dag = DAG(
    "data_cleaning_MLP",
    schedule_interval='0 14 * * *',
    tags=['Standardized Process'],
    default_args=default_args,
    max_active_runs=1,
    catchup=False,
    is_paused_upon_creation=True,
)

# Build one PythonOperator per entry of the configured task scheme, keeping
# only entries whose "active" value equals True. The explicit comparison is
# kept on purpose: plain truthiness would also accept values such as
# non-empty strings.
list_task = [
    PythonOperator(
        task_id=step['task_id'],
        python_callable=function_mapping[step['function']],
        op_kwargs=step['op_kwargs'],
        retries=3,
        dag=dag,
    )
    for step in data['task_scheme']
    if step['active'] == True  # noqa: E712
]

# Set dependencies: link each task to the one that follows it in the list,
# so the active tasks form a single sequential chain in configuration order.
for upstream, downstream in zip(list_task, list_task[1:]):
    upstream >> downstream