In this quick notes, we will go through aggregation in Python.

This is part of lectures on Learning Python for Data Analysis and Visualization by Jose Portilla on Udemy.

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

 

url = “http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/”

In [2]:
dframe_wine =  pd.read_csv('winequality-red.csv', sep=';')

 

In [4]:
dframe_wine.head()

 

Out[4]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
0 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5
1 7.8 0.88 0.00 2.6 0.098 25.0 67.0 0.9968 3.20 0.68 9.8 5
2 7.8 0.76 0.04 2.3 0.092 15.0 54.0 0.9970 3.26 0.65 9.8 5
3 11.2 0.28 0.56 1.9 0.075 17.0 60.0 0.9980 3.16 0.58 9.8 6
4 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5
In [5]:
#get the average alcohol content for all the wines
dframe_wine['alcohol'].mean()

 

Out[5]:
10.422983114446502
In [6]:
#define a function which will return the differnce between the min and maximum values
def max_to_min(arr):
    return arr.max() - arr.min()

 

In [7]:
wino = dframe_wine.groupby('quality')
wino.describe()

 

Out[7]:
alcohol chlorides citric acid density fixed acidity free sulfur dioxide pH residual sugar sulphates total sulfur dioxide volatile acidity
quality
3 count 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000
mean 9.955000 0.122500 0.171000 0.997464 8.360000 11.000000 3.398000 2.635000 0.570000 24.900000 0.884500
std 0.818009 0.066241 0.250664 0.002002 1.770875 9.763879 0.144052 1.401596 0.122020 16.828877 0.331256
min 8.400000 0.061000 0.000000 0.994710 6.700000 3.000000 3.160000 1.200000 0.400000 9.000000 0.440000
25% 9.725000 0.079000 0.005000 0.996150 7.150000 5.000000 3.312500 1.875000 0.512500 12.500000 0.647500
50% 9.925000 0.090500 0.035000 0.997565 7.500000 6.000000 3.390000 2.100000 0.545000 15.000000 0.845000
75% 10.575000 0.143000 0.327500 0.998770 9.875000 14.500000 3.495000 3.100000 0.615000 42.500000 1.010000
max 11.000000 0.267000 0.660000 1.000800 11.600000 34.000000 3.630000 5.700000 0.860000 49.000000 1.580000
4 count 53.000000 53.000000 53.000000 53.000000 53.000000 53.000000 53.000000 53.000000 53.000000 53.000000 53.000000
mean 10.265094 0.090679 0.174151 0.996542 7.779245 12.264151 3.381509 2.694340 0.596415 36.245283 0.693962
std 0.934776 0.076192 0.201030 0.001575 1.626624 9.025926 0.181441 1.789436 0.239391 27.583374 0.220110
min 9.000000 0.045000 0.000000 0.993400 4.600000 3.000000 2.740000 1.300000 0.330000 7.000000 0.230000
25% 9.600000 0.067000 0.030000 0.995650 6.800000 6.000000 3.300000 1.900000 0.490000 14.000000 0.530000
50% 10.000000 0.080000 0.090000 0.996500 7.500000 11.000000 3.370000 2.100000 0.560000 26.000000 0.670000
75% 11.000000 0.089000 0.270000 0.997450 8.400000 15.000000 3.500000 2.800000 0.600000 49.000000 0.870000
max 13.100000 0.610000 1.000000 1.001000 12.500000 41.000000 3.900000 12.900000 2.000000 119.000000 1.130000
5 count 681.000000 681.000000 681.000000 681.000000 681.000000 681.000000 681.000000 681.000000 681.000000 681.000000 681.000000
mean 9.899706 0.092736 0.243686 0.997104 8.167254 16.983847 3.304949 2.528855 0.620969 56.513950 0.577041
std 0.736521 0.053707 0.180003 0.001589 1.563988 10.955446 0.150618 1.359753 0.171062 36.993116 0.164801
min 8.500000 0.039000 0.000000 0.992560 5.000000 3.000000 2.880000 1.200000 0.370000 6.000000 0.180000
25% 9.400000 0.074000 0.090000 0.996200 7.100000 9.000000 3.200000 1.900000 0.530000 26.000000 0.460000
50% 9.700000 0.081000 0.230000 0.997000 7.800000 15.000000 3.300000 2.200000 0.580000 47.000000 0.580000
75% 10.200000 0.094000 0.360000 0.997900 8.900000 23.000000 3.400000 2.600000 0.660000 84.000000 0.670000
max 14.900000 0.611000 0.790000 1.003150 15.900000 68.000000 3.740000 15.500000 1.980000 155.000000 1.330000
6 count 638.000000 638.000000 638.000000 638.000000 638.000000 638.000000 638.000000 638.000000 638.000000 638.000000 638.000000
mean 10.629519 0.084956 0.273824 0.996615 8.347179 15.711599 3.318072 2.477194 0.675329 40.869906 0.497484
std 1.049639 0.039563 0.195108 0.002000 1.797849 9.940911 0.153995 1.441576 0.158650 25.038250 0.160962
min 8.400000 0.034000 0.000000 0.990070 4.700000 1.000000 2.860000 0.900000 0.400000 6.000000 0.160000
25% 9.800000 0.068250 0.090000 0.995402 7.000000 8.000000 3.220000 1.900000 0.580000 23.000000 0.380000
50% 10.500000 0.078000 0.260000 0.996560 7.900000 14.000000 3.320000 2.200000 0.640000 35.000000 0.490000
75% 11.300000 0.088000 0.430000 0.997893 9.400000 21.000000 3.410000 2.500000 0.750000 54.000000 0.600000
max 14.000000 0.415000 0.780000 1.003690 14.300000 72.000000 4.010000 15.400000 1.950000 165.000000 1.040000
7 count 199.000000 199.000000 199.000000 199.000000 199.000000 199.000000 199.000000 199.000000 199.000000 199.000000 199.000000
mean 11.465913 0.076588 0.375176 0.996104 8.872362 14.045226 3.290754 2.720603 0.741256 35.020101 0.403920
std 0.961933 0.029456 0.194432 0.002176 1.992483 10.175255 0.150101 1.371509 0.135639 33.191206 0.145224
min 9.200000 0.012000 0.000000 0.990640 4.900000 3.000000 2.920000 1.200000 0.390000 7.000000 0.120000
25% 10.800000 0.062000 0.305000 0.994765 7.400000 6.000000 3.200000 2.000000 0.650000 17.500000 0.300000
50% 11.500000 0.073000 0.400000 0.995770 8.800000 11.000000 3.280000 2.300000 0.740000 27.000000 0.370000
75% 12.100000 0.087000 0.490000 0.997360 10.100000 18.000000 3.380000 2.750000 0.830000 43.000000 0.485000
max 14.000000 0.358000 0.760000 1.003200 15.600000 54.000000 3.780000 8.900000 1.360000 289.000000 0.915000
8 count 18.000000 18.000000 18.000000 18.000000 18.000000 18.000000 18.000000 18.000000 18.000000 18.000000 18.000000
mean 12.094444 0.068444 0.391111 0.995212 8.566667 13.277778 3.267222 2.577778 0.767778 33.444444 0.423333
std 1.224011 0.011678 0.199526 0.002378 2.119656 11.155613 0.200640 1.295038 0.115379 25.433240 0.144914
min 9.800000 0.044000 0.030000 0.990800 5.000000 3.000000 2.880000 1.400000 0.630000 12.000000 0.260000
25% 11.325000 0.062000 0.302500 0.994175 7.250000 6.000000 3.162500 1.800000 0.690000 16.000000 0.335000
50% 12.150000 0.070500 0.420000 0.994940 8.250000 7.500000 3.230000 2.100000 0.740000 21.500000 0.370000
75% 12.875000 0.075500 0.530000 0.997200 10.225000 16.500000 3.350000 2.600000 0.820000 43.000000 0.472500
max 14.000000 0.086000 0.720000 0.998800 12.600000 42.000000 3.720000 6.400000 1.100000 88.000000 0.850000
In [8]:
#do an aggregation on the groupby object
wino.agg(max_to_min)

 

Out[8]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol
quality
3 4.9 1.140 0.66 4.5 0.206 31.0 40.0 0.00609 0.47 0.46 2.6
4 7.9 0.900 1.00 11.6 0.565 38.0 112.0 0.00760 1.16 1.67 4.1
5 10.9 1.150 0.79 14.3 0.572 65.0 149.0 0.01059 0.86 1.61 6.4
6 9.6 0.880 0.78 14.5 0.381 71.0 159.0 0.01362 1.15 1.55 5.6
7 10.7 0.795 0.76 7.7 0.346 51.0 282.0 0.01256 0.86 0.97 4.8
8 7.6 0.590 0.69 5.0 0.042 39.0 76.0 0.00800 0.84 0.47 4.2
In [16]:
wino.agg(sum)

 

Out[16]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol
quality
3 83.6 8.845 1.71 26.35 1.225 110.0 249.0 9.97464 33.98 5.70 99.550000
4 412.3 36.780 9.23 142.80 4.806 650.0 1921.0 52.81675 179.22 31.61 544.050000
5 5561.9 392.965 165.95 1722.15 63.153 11566.0 38486.0 679.02757 2250.67 422.88 6741.700000
6 5325.5 317.395 174.70 1580.45 54.202 10024.0 26075.0 635.84041 2116.93 430.86 6781.633333
7 1765.6 80.380 74.66 541.40 15.241 2795.0 6969.0 198.22475 654.86 147.51 2281.716667
8 154.2 7.620 7.04 46.40 1.232 239.0 602.0 17.91382 58.81 13.82 217.700000
In [14]:
wino.agg('count')

 

Out[14]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol
quality
3 10 10 10 10 10 10 10 10 10 10 10
4 53 53 53 53 53 53 53 53 53 53 53
5 681 681 681 681 681 681 681 681 681 681 681
6 638 638 638 638 638 638 638 638 638 638 638
7 199 199 199 199 199 199 199 199 199 199 199
8 18 18 18 18 18 18 18 18 18 18 18
In [17]:
#create a new column in dataframe as per your requirements
dframe_wine['alc / quality ratio'] = dframe_wine['alcohol'] / dframe_wine['quality']

 

In [18]:
dframe_wine

 

Out[18]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality alc / quality ratio
0 7.4 0.700 0.00 1.9 0.076 11.0 34.0 0.99780 3.51 0.56 9.4 5 1.880000
1 7.8 0.880 0.00 2.6 0.098 25.0 67.0 0.99680 3.20 0.68 9.8 5 1.960000
2 7.8 0.760 0.04 2.3 0.092 15.0 54.0 0.99700 3.26 0.65 9.8 5 1.960000
3 11.2 0.280 0.56 1.9 0.075 17.0 60.0 0.99800 3.16 0.58 9.8 6 1.633333
4 7.4 0.700 0.00 1.9 0.076 11.0 34.0 0.99780 3.51 0.56 9.4 5 1.880000
5 7.4 0.660 0.00 1.8 0.075 13.0 40.0 0.99780 3.51 0.56 9.4 5 1.880000
6 7.9 0.600 0.06 1.6 0.069 15.0 59.0 0.99640 3.30 0.46 9.4 5 1.880000
7 7.3 0.650 0.00 1.2 0.065 15.0 21.0 0.99460 3.39 0.47 10.0 7 1.428571
8 7.8 0.580 0.02 2.0 0.073 9.0 18.0 0.99680 3.36 0.57 9.5 7 1.357143
9 7.5 0.500 0.36 6.1 0.071 17.0 102.0 0.99780 3.35 0.80 10.5 5 2.100000
10 6.7 0.580 0.08 1.8 0.097 15.0 65.0 0.99590 3.28 0.54 9.2 5 1.840000
11 7.5 0.500 0.36 6.1 0.071 17.0 102.0 0.99780 3.35 0.80 10.5 5 2.100000
12 5.6 0.615 0.00 1.6 0.089 16.0 59.0 0.99430 3.58 0.52 9.9 5 1.980000
13 7.8 0.610 0.29 1.6 0.114 9.0 29.0 0.99740 3.26 1.56 9.1 5 1.820000
14 8.9 0.620 0.18 3.8 0.176 52.0 145.0 0.99860 3.16 0.88 9.2 5 1.840000
15 8.9 0.620 0.19 3.9 0.170 51.0 148.0 0.99860 3.17 0.93 9.2 5 1.840000
16 8.5 0.280 0.56 1.8 0.092 35.0 103.0 0.99690 3.30 0.75 10.5 7 1.500000
17 8.1 0.560 0.28 1.7 0.368 16.0 56.0 0.99680 3.11 1.28 9.3 5 1.860000
18 7.4 0.590 0.08 4.4 0.086 6.0 29.0 0.99740 3.38 0.50 9.0 4 2.250000
19 7.9 0.320 0.51 1.8 0.341 17.0 56.0 0.99690 3.04 1.08 9.2 6 1.533333
20 8.9 0.220 0.48 1.8 0.077 29.0 60.0 0.99680 3.39 0.53 9.4 6 1.566667
21 7.6 0.390 0.31 2.3 0.082 23.0 71.0 0.99820 3.52 0.65 9.7 5 1.940000
22 7.9 0.430 0.21 1.6 0.106 10.0 37.0 0.99660 3.17 0.91 9.5 5 1.900000
23 8.5 0.490 0.11 2.3 0.084 9.0 67.0 0.99680 3.17 0.53 9.4 5 1.880000
24 6.9 0.400 0.14 2.4 0.085 21.0 40.0 0.99680 3.43 0.63 9.7 6 1.616667
25 6.3 0.390 0.16 1.4 0.080 11.0 23.0 0.99550 3.34 0.56 9.3 5 1.860000
26 7.6 0.410 0.24 1.8 0.080 4.0 11.0 0.99620 3.28 0.59 9.5 5 1.900000
27 7.9 0.430 0.21 1.6 0.106 10.0 37.0 0.99660 3.17 0.91 9.5 5 1.900000
28 7.1 0.710 0.00 1.9 0.080 14.0 35.0 0.99720 3.47 0.55 9.4 5 1.880000
29 7.8 0.645 0.00 2.0 0.082 8.0 16.0 0.99640 3.38 0.59 9.8 6 1.633333
1569 6.2 0.510 0.14 1.9 0.056 15.0 34.0 0.99396 3.48 0.57 11.5 6 1.916667
1570 6.4 0.360 0.53 2.2 0.230 19.0 35.0 0.99340 3.37 0.93 12.4 6 2.066667
1571 6.4 0.380 0.14 2.2 0.038 15.0 25.0 0.99514 3.44 0.65 11.1 6 1.850000
1572 7.3 0.690 0.32 2.2 0.069 35.0 104.0 0.99632 3.33 0.51 9.5 5 1.900000
1573 6.0 0.580 0.20 2.4 0.075 15.0 50.0 0.99467 3.58 0.67 12.5 6 2.083333
1574 5.6 0.310 0.78 13.9 0.074 23.0 92.0 0.99677 3.39 0.48 10.5 6 1.750000
1575 7.5 0.520 0.40 2.2 0.060 12.0 20.0 0.99474 3.26 0.64 11.8 6 1.966667
1576 8.0 0.300 0.63 1.6 0.081 16.0 29.0 0.99588 3.30 0.78 10.8 6 1.800000
1577 6.2 0.700 0.15 5.1 0.076 13.0 27.0 0.99622 3.54 0.60 11.9 6 1.983333
1578 6.8 0.670 0.15 1.8 0.118 13.0 20.0 0.99540 3.42 0.67 11.3 6 1.883333
1579 6.2 0.560 0.09 1.7 0.053 24.0 32.0 0.99402 3.54 0.60 11.3 5 2.260000
1580 7.4 0.350 0.33 2.4 0.068 9.0 26.0 0.99470 3.36 0.60 11.9 6 1.983333
1581 6.2 0.560 0.09 1.7 0.053 24.0 32.0 0.99402 3.54 0.60 11.3 5 2.260000
1582 6.1 0.715 0.10 2.6 0.053 13.0 27.0 0.99362 3.57 0.50 11.9 5 2.380000
1583 6.2 0.460 0.29 2.1 0.074 32.0 98.0 0.99578 3.33 0.62 9.8 5 1.960000
1584 6.7 0.320 0.44 2.4 0.061 24.0 34.0 0.99484 3.29 0.80 11.6 7 1.657143
1585 7.2 0.390 0.44 2.6 0.066 22.0 48.0 0.99494 3.30 0.84 11.5 6 1.916667
1586 7.5 0.310 0.41 2.4 0.065 34.0 60.0 0.99492 3.34 0.85 11.4 6 1.900000
1587 5.8 0.610 0.11 1.8 0.066 18.0 28.0 0.99483 3.55 0.66 10.9 6 1.816667
1588 7.2 0.660 0.33 2.5 0.068 34.0 102.0 0.99414 3.27 0.78 12.8 6 2.133333
1589 6.6 0.725 0.20 7.8 0.073 29.0 79.0 0.99770 3.29 0.54 9.2 5 1.840000
1590 6.3 0.550 0.15 1.8 0.077 26.0 35.0 0.99314 3.32 0.82 11.6 6 1.933333
1591 5.4 0.740 0.09 1.7 0.089 16.0 26.0 0.99402 3.67 0.56 11.6 6 1.933333
1592 6.3 0.510 0.13 2.3 0.076 29.0 40.0 0.99574 3.42 0.75 11.0 6 1.833333
1593 6.8 0.620 0.08 1.9 0.068 28.0 38.0 0.99651 3.42 0.82 9.5 6 1.583333
1594 6.2 0.600 0.08 2.0 0.090 32.0 44.0 0.99490 3.45 0.58 10.5 5 2.100000
1595 5.9 0.550 0.10 2.2 0.062 39.0 51.0 0.99512 3.52 0.76 11.2 6 1.866667
1596 6.3 0.510 0.13 2.3 0.076 29.0 40.0 0.99574 3.42 0.75 11.0 6 1.833333
1597 5.9 0.645 0.12 2.0 0.075 32.0 44.0 0.99547 3.57 0.71 10.2 5 2.040000
1598 6.0 0.310 0.47 3.6 0.067 18.0 42.0 0.99549 3.39 0.66 11.0 6 1.833333

1599 rows × 13 columns

In [20]:
#using pivot table instead of groupby to achieve same results
dframe_wine.pivot_table(index='quality')

 

Out[20]:
alc / quality ratio alcohol chlorides citric acid density fixed acidity free sulfur dioxide pH residual sugar sulphates total sulfur dioxide volatile acidity
quality
3 3.318333 9.955000 0.122500 0.171000 0.997464 8.360000 11.000000 3.398000 2.635000 0.570000 24.900000 0.884500
4 2.566274 10.265094 0.090679 0.174151 0.996542 7.779245 12.264151 3.381509 2.694340 0.596415 36.245283 0.693962
5 1.979941 9.899706 0.092736 0.243686 0.997104 8.167254 16.983847 3.304949 2.528855 0.620969 56.513950 0.577041
6 1.771587 10.629519 0.084956 0.273824 0.996615 8.347179 15.711599 3.318072 2.477194 0.675329 40.869906 0.497484
7 1.637988 11.465913 0.076588 0.375176 0.996104 8.872362 14.045226 3.290754 2.720603 0.741256 35.020101 0.403920
8 1.511806 12.094444 0.068444 0.391111 0.995212 8.566667 13.277778 3.267222 2.577778 0.767778 33.444444 0.423333
In [22]:
dframe_wine.groupby('quality').mean()

 

Out[22]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol alc / quality ratio
quality
3 8.360000 0.884500 0.171000 2.635000 0.122500 11.000000 24.900000 0.997464 3.398000 0.570000 9.955000 3.318333
4 7.779245 0.693962 0.174151 2.694340 0.090679 12.264151 36.245283 0.996542 3.381509 0.596415 10.265094 2.566274
5 8.167254 0.577041 0.243686 2.528855 0.092736 16.983847 56.513950 0.997104 3.304949 0.620969 9.899706 1.979941
6 8.347179 0.497484 0.273824 2.477194 0.084956 15.711599 40.869906 0.996615 3.318072 0.675329 10.629519 1.771587
7 8.872362 0.403920 0.375176 2.720603 0.076588 14.045226 35.020101 0.996104 3.290754 0.741256 11.465913 1.637988
8 8.566667 0.423333 0.391111 2.577778 0.068444 13.277778 33.444444 0.995212 3.267222 0.767778 12.094444 1.511806
In [24]:
#lets plot the data on a scatterplot
%matplotlib inline
dframe_wine.plot(kind='scatter', x='quality', y='alcohol')

 

Out[24]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a71c4c93c8>

 

Similar Posts

Leave a Reply

Your email address will not be published. Required fields are marked *