{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Selección y manejo de columnas --- 5:34 min\n", "\n", "* 5:34 min | Última modificación: Octubre 6, 2021" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Este tutorial esta basado en https://es.hortonworks.com/tutorial/beginners-guide-to-apache-pig/" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Preparación" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "pd.set_option(\"display.notebook_repr_html\", False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Carga de los datos de los eventos de los conductores" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "En el siguiente código se crea crea un DataFrame de eventos de los conductores en el sistema." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['driverId', 'truckId', 'eventTime', 'eventType', 'longitude',\n", " 'latitude', 'eventKey', 'CorrelationId', 'driverName', 'routeId',\n", " 'routeName', 'eventDate'],\n", " dtype='object')" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "# Carga del archivo ddesde un repo en GitHub\n", "#\n", "truck_events = pd.read_csv(\n", " \"https://raw.githubusercontent.com/jdvelasq/datalabs/master/datasets/drivers/truck_event_text_partition.csv\",\n", " sep=\",\",\n", " thousands=None,\n", " decimal=\".\",\n", ")\n", "\n", "#\n", "# Columnas del dataframe\n", "#\n", "truck_events.columns" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Obtención de un subconjunto de columnas usando `[` y `]`" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " driverId eventTime eventType\n", "0 14 59:21.4 Normal\n", "1 18 59:21.7 Normal\n", "2 27 59:21.7 Normal\n", "3 11 59:21.7 Normal\n", "4 22 59:21.7 Normal" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "truck_events[\n", " [\"driverId\", \"eventTime\", \"eventType\"],\n", "].head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Obtención de un subcojunto de columnas usando filter()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " driverId eventTime eventType\n", "0 14 59:21.4 Normal\n", "1 18 59:21.7 Normal\n", "2 27 59:21.7 Normal\n", "3 11 59:21.7 Normal\n", "4 22 59:21.7 Normal" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "truck_events.filter(items=[\"driverId\", \"eventTime\", \"eventType\"]).head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Obtención de un subconjunto de columnas usando una expresión regular" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " eventTime eventType longitude latitude driverName \\\n", "0 59:21.4 Normal -94.58 37.03 Adis Cesir \n", "1 59:21.7 Normal -89.66 39.78 Grant Liu \n", "2 59:21.7 Normal -90.21 38.65 Mark Lochbihler \n", "3 59:21.7 Normal -90.20 38.65 Jamie Engesser \n", "4 59:21.7 Normal -90.04 35.19 Nadeem Asghar \n", "\n", " routeName eventDate \n", "0 Joplin to Kansas City Route 2 2016-05-27-22 \n", "1 Springfield to KC Via Hanibal 2016-05-27-22 \n", "2 Springfield to KC Via Columbia Route 2 2016-05-27-22 \n", "3 Saint Louis to Memphis Route2 2016-05-27-22 \n", "4 Saint Louis to Chicago Route2 2016-05-27-22 " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "truck_events.filter(regex=\"e$\", axis=1).head()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " eventTime eventType eventKey eventDate\n", "0 59:21.4 Normal 14|25|9223370572464814373 2016-05-27-22\n", "1 59:21.7 Normal 18|16|9223370572464814089 2016-05-27-22\n", "2 59:21.7 Normal 27|105|9223370572464814070 2016-05-27-22\n", "3 59:21.7 Normal 11|74|9223370572464814123 2016-05-27-22\n", "4 59:21.7 Normal 22|87|9223370572464814101 2016-05-27-22" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "# Selección de las columnas que contienen 'ent'\n", "#\n", "truck_events.filter(like=\"ent\", axis=1).head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Ordenamiento de las columnas en el dataframe" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " CorrelationId driverId driverName eventDate \\\n", "0 3.660000e+18 14 Adis Cesir 2016-05-27-22 \n", "1 3.660000e+18 18 Grant Liu 2016-05-27-22 \n", "2 3.660000e+18 27 Mark Lochbihler 2016-05-27-22 \n", "3 3.660000e+18 11 Jamie Engesser 2016-05-27-22 \n", "4 3.660000e+18 22 Nadeem Asghar 2016-05-27-22 \n", "... ... ... ... ... \n", "17070 1.000000e+03 11 Jamie Engesser 2016-06-02-20 \n", "17071 1.000000e+03 16 Tom McCuch 2016-06-02-20 \n", "17072 1.000000e+03 18 Grant Liu 2016-06-02-20 \n", "17073 1.000000e+03 10 George Vetticaden 2016-06-02-20 \n", "17074 1.000000e+03 19 Ajay Singh 2016-06-02-20 \n", "\n", " eventKey eventTime eventType latitude longitude \\\n", "0 14|25|9223370572464814373 59:21.4 Normal 37.03 -94.58 \n", "1 18|16|9223370572464814089 59:21.7 Normal 39.78 -89.66 \n", "2 27|105|9223370572464814070 59:21.7 Normal 38.65 -90.21 \n", "3 11|74|9223370572464814123 59:21.7 Normal 38.65 -90.20 \n", "4 22|87|9223370572464814101 59:21.7 Normal 35.19 -90.04 \n", "... ... ... ... ... ... \n", "17070 11|27|9223370571956432681 12:23.1 Normal 38.65 -90.20 \n", "17071 16|46|9223370571956431821 12:24.0 Normal 38.33 -94.35 \n", "17072 18|49|9223370571956432141 12:23.7 Normal 39.71 -90.52 \n", "17073 10|39|9223370571956431961 12:23.8 Normal 37.21 -93.34 \n", "17074 19|100|9223370571956431810 12:24.0 Normal 36.79 -97.37 \n", "\n", " routeId routeName truckId \n", "0 160405074 Joplin to Kansas City Route 2 25 \n", "1 1565885487 Springfield to KC Via Hanibal 16 \n", "2 1325562373 Springfield to KC Via Columbia Route 2 105 \n", "3 1567254452 Saint Louis to Memphis Route2 74 \n", "4 1198242881 Saint Louis to Chicago Route2 87 \n", "... ... ... ... \n", "17070 1198242881 Saint Louis to Chicago Route2 27 \n", "17071 160405074 Joplin to Kansas City Route 2 46 \n", "17072 1565885487 Springfield to KC Via Hanibal 49 \n", "17073 1390372503 Saint Louis to Tulsa 39 \n", "17074 1962261785 Wichita to Little Rock.kml 100 \n", "\n", "[17075 rows x 12 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "truck_events.sort_index(\n", " axis=1,\n", " ascending=True,\n", ").head()" ] }, { "cell_type": "markdown", "metadata": { "tags": [] }, "source": [ "## Cambio del nombre de columnas" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " driverId truckId event_time event_type longitude latitude \\\n", "0 14 25 59:21.4 Normal -94.58 37.03 \n", "1 18 16 59:21.7 Normal -89.66 39.78 \n", "2 27 105 59:21.7 Normal -90.21 38.65 \n", "3 11 74 59:21.7 Normal -90.20 38.65 \n", "4 22 87 59:21.7 Normal -90.04 35.19 \n", "\n", " eventKey CorrelationId driverName routeId \\\n", "0 14|25|9223370572464814373 3.660000e+18 Adis Cesir 160405074 \n", "1 18|16|9223370572464814089 3.660000e+18 Grant Liu 1565885487 \n", "2 27|105|9223370572464814070 3.660000e+18 Mark Lochbihler 1325562373 \n", "3 11|74|9223370572464814123 3.660000e+18 Jamie Engesser 1567254452 \n", "4 22|87|9223370572464814101 3.660000e+18 Nadeem Asghar 1198242881 \n", "\n", " routeName eventDate \n", "0 Joplin to Kansas City Route 2 2016-05-27-22 \n", "1 Springfield to KC Via Hanibal 2016-05-27-22 \n", "2 Springfield to KC Via Columbia Route 2 2016-05-27-22 \n", "3 Saint Louis to Memphis Route2 2016-05-27-22 \n", "4 Saint Louis to Chicago Route2 2016-05-27-22 " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "truck_events.rename(\n", " columns={\n", " \"eventTime\": \"event_time\",\n", " \"eventType\": \"event_type\",\n", " },\n", ").head()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.10.7 ('.venv': venv)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.7" }, "toc-autonumbering": false, "toc-showmarkdowntxt": false, "vscode": { "interpreter": { "hash": "99683f87c1354598af39114791cc78b29c5617e49c4bfa4c5fb7492534eeeb87" } } }, "nbformat": 4, "nbformat_minor": 4 }