import sqlalchemy
import pyodbc
from pandas import DataFrame
from bokeh.plotting import figure, output_file, show
import pandas as pd
from xlsxwriter.utility import xl_rowcol_to_cell

df = DataFrame


## connect to SqlServer Database and get information. 
try:

    import urllib
    params = urllib.quote_plus("DRIVER={SQL Server Native Client 11.0};SERVER=.\MSSQLSERVER_ENT;DATABASE=Antrak;Trusted_Connection=yes;")

    engine = sqlalchemy.create_engine('mssql+pyodbc:///?odbc_connect=%s"' % params)
    resoverall = engine.execute("SELECT top 10 * FROM FlightBookingsMain")


    df = df(resoverall.fetchall())
    df.columns = resoverall.keys()

    
except (RuntimeError, TypeError, NameError):
    print('Error in Conneccting')
    print(RuntimeError, TypeError, NameError)
finally:
    print("connected")

connected

#lets look at the head of the dataframe
df.head()

#lots plot both a circle and line graph
p.circle(df['Booking Date'], df['AFP'], size=4, color='darkgrey', alpha=0.2, legend='close')
p.line(df['Booking Date'], df['Total Cost'], color='navy', legend='avg')

<bokeh.models.renderers.GlyphRenderer at 0xc8c00b8>

# show the results
#output_file('linegraph.html')
show(p)

# bokeh server plotting
#sample server plotting code from bokeh

#from bokeh.plotting import figure, show, output_server

#p = figure(title="Server Plot")
#p.circle([1, 2, 3], [4, 5, 6])

#output_server("hover")

#show(p)

I came across this brilliant analyis by pbpython. and decided to play around with it on this data. you can see the full code and explanation on his website: http://pbpython.com/pandas-pivot-table-explained.html. To I have added just a little bit of some extra exploration, the key and major analysis are from pbpython so you can check him out.

For convenience sake, let’s define the SEX column as a category and set the order we want to view.

This isn’t strictly required but helps us keep the order we want as we work through analyzing the data.

df['Sex'] = df['Sex'].astype('category')
df['Sex'].cat.set_categories(['m','f'], inplace=True)

The simplest pivot table must have a dataframe and an index . In this case, let’s use the Name as our index.

df['Booking Date'] = pd.to_datetime(df['Booking Date'])

#get year from date

df['year'] = df['Booking Date'].apply(lambda x: x.year)

#get month from date

df['month'] = df['Booking Date'].apply(lambda x: x.month)

df.head()

#get all data less than a particular date
df[df['Booking Date']<'2014-10-28']

year_plot = pd.pivot_table(df, index=['year'])
year_plot

p.line(df['year'], df['Total Cost'], color='darkgrey', alpha=0.2, legend='year plot')
output_file('linegraph.html')
show(p)

pv_result = pd.pivot_table(df, index=['Last Name'])

type(pv_result)
pv_result

You can have multiple indexes as well. In fact, most of the pivot_table args can take multiple values via a list.

pd.pivot_table(df, index=['Last Name','Booking Date','Departure Date'])

For this purpose, the Flight Booking ID, Insurance and Postcode columns aren’t really useful. Let’s remove it by explicitly defining the columns we care about using the values field.

#let

sum_all =pd.pivot_table(df, index=['Last Name'], values=['Total Cost','AFP','Age'], aggfunc= np.sum)
sum_all

The price column automatically averages the data but we can do a count or a sum. Adding them is simple using aggfunc and np.sum .

ttlcost = pd.pivot_table(df, index=['Last Name'], values=['Total Cost'], aggfunc=np.sum)
ttlcost

aggfunc can take a list of functions. Let’s try a mean using the numpy mean function and len to get a count.

len_sum = pd.pivot_table(df, index=['Last Name'], values=['Total Cost'], aggfunc =[np.sum,len])
len_sum

I think one of the confusing points with the pivot_table is the use of columns and values . Remember, columns are optional – they provide an additional way to segment the actual values you care about. The aggregation functions are applied to the values you list.

# function to put dataframe on same worksheet # 2 frames per line
#format1 = workbook.add_format()

def multiple_dfsUp(df_list, sheets, file_name, spaces):
    writer = pd.ExcelWriter(file_name,engine='xlsxwriter')
    
    row = 0  # for row to insert data at in the excel
    col = 0  # for columns to insert data at in the excel
    
    rowtostart = 0
    increase = 0 #incrementer
    number_rows = 0 # total number of rows / length for the dataframe
    
    old_length = 0 # the last row/column pair where last insert / formatting was done
    
    for dataframe in df_list:
        if(increase%2==0):
            row = rowtostart
            col = 0
            rowtostart = rowtostart- 4
            ttlcost_pos = 0
            old_position =0 # last position where totalcost was located
            
        dataframe.to_excel(writer,sheet_name=sheets,startrow=row , startcol=col)
        number_rows = len(dataframe.index) #total rows length for the current dataframe
        
        workbook = writer.book
        worksheet = writer.sheets[sheets] #get access to the sheets
        
        
        #Add a number format for cells with money
        money_fmt = workbook.add_format({'num_format':'$#,##0','bold':True})

        #Add a percent a format with  1 decimal point
        percent_fmt = workbook.add_format({'num_format':'0.0%','bold':True})
        format2 = workbook.add_format({'bg_color': '#C6EFCE',
                               'font_color': '#006100'})
        
        #get index position for total cost
        ttlcost_pos = dataframe.columns.get_loc("Total Cost") + ttlcost_pos +1
        #get the count of rows
        #for column in range(len(dataframe.columns)):
        #print('dataframe INNER ',column)
        #cell location 
        #print('DATAFRAME ', dataframe)
        
        start_range = xl_rowcol_to_cell(row+1, ttlcost_pos)
        
        #print('START range ', increase, start_range) #debugging
        #print('START row number ', increase, row) #debugging
        #print('START ttlcost_pos ', increase, ttlcost_pos)   #debugging
        
        
        end_range = xl_rowcol_to_cell(number_rows+row, ttlcost_pos)
        
        #print('END range ',increase, end_range)#debugging
        
        
        ttlcost_range = start_range + ":" + end_range
        
        ttlcost_color_range = ttlcost_range
        
        #print('TOTAL RANGE ', increase, ttlcost_range) #debugging
        
        #apply money formatting to total cost
        worksheet.conditional_format(ttlcost_range, {'type': 'cell',
                                         'criteria': '>',
                                         'value': 0,
                                         'format': money_fmt})
        
        # Highlight the top 5 values in Green
        worksheet.conditional_format(ttlcost_color_range, {'type': 'top',
                                           'value': '5',
                                           'format': format2})
        
            
        col = col + len(dataframe.columns) + spaces + 1
        rowtostart = rowtostart + len(dataframe.index) + spaces + 1
        increase = increase + 1
                
        #print('length of df',len(dataframe.columns)) #debugging
        
        if(ttlcost_pos>=len(dataframe.columns)-1):
           #ttlcost_pos = ttlcost_pos + spaces + 1
           ttlcost_pos = old_position +len(dataframe.columns)  + spaces + 1
        else:
           ttlcost_pos = len(dataframe.columns) + spaces + 1
        old_position = ttlcost_pos # old position to hold
        
    from datetime import datetime
    import xlsxwriter
    
    bold = workbook.add_format({'bold': 1})
    
    # Write some data headers.
    worksheet.write('A31', 'Item', bold)
    worksheet.write('B31', 'Date', bold)
    worksheet.write('C31', 'Cost', bold)
    
    # Add an Excel date format.
    date_format = workbook.add_format({'num_format': 'mmmm d yyyy'})
    
    expenses = (
     ['Rent', '2013-01-13', 1000],
     ['Gas',  '2013-01-14',  100],
     ['Food', '2013-01-16',  300],
     ['Gym',  '2013-01-20',   50],
     )
    
    row = 31
    col = 0

    for item, date_str, cost in (expenses):
        # Convert the date string into a datetime object.
        date = datetime.strptime(date_str, "%Y-%m-%d")

        worksheet.write_string  (row, col,     item              )
        worksheet.write_datetime(row, col + 1, date, date_format  )
        worksheet.write_number  (row, col + 2, cost, money_fmt)
        row += 1

    # Write a total using a formula.
    worksheet.write(row, 0, 'Total', bold)
    worksheet.write(row, 2, '=SUM(C32:C35)', money_fmt)
    
    chart = workbook.add_chart({'type': 'column'})
    
    
    writer.save()

# list of dataframes

dfs = [ttlcost,sum_all, pv_result, year_plot]
multiple_dfsUp(dfs, 'report', 'test1.xlsx', 1)

pv_result.to_html('filename.html')

note.¶

the index in the dataframe is not counted as part of the rows in excel when you visualize in excel. hence if you have one index and though it appears like there are 2 rows in excel when you view it, the first column is only the “index” and actually only one single column is counted

	First Name	Last Name	Sex	Phone	Fax	Email Address	Booking Date	Departure Date	Arrival Date	Address1	…	Number of Passengers	Total Cost	Currency Numeric Code	Company website	Flight ID Number	Flight Booking ID	Insurance	AFP	Flight Name	Age
0	Ellis	Harris	m	077 5665 3370	077 5665 3373	Ellis.E.Harris@mailinator.com	2012-12-18	2011-07-16	2000-08-23	63 Hull Road	…	2	983	710	http://qvc.com	4111113236285560	1	16	62	ZAR	62
1	Aaron	Field	m	079 6068 9016	079 6068 9015	Aaron.A.Field@mailinator.com	2014-10-28	2005-02-26	2000-07-30	42 Russell Rd	…	9	2452	710	http://msnbc.com	4111110402728960	2	83	38	ZAR	44
2	Archie	Kaur	m	070 3056 9804	070 3056 9809	Archie.M.Kaur@spambob.com	2003-11-17	2000-12-05	2007-03-07	10 Southend Avenue	…	8	1710	710	http://ap.org	4111110879255550	3	54	58	ZAR	60
3	Bradley	Middleton	m	070 0765 7814	070 0765 7816	Bradley.A.Middleton@dodgeit.com	2013-07-18	2011-02-26	2015-10-25	32 Pier Road	…	6	518	710	http://tripadvisor.com	4111111855573320	4	72	43	ZAR	66
4	Evan	Reeves	m	070 8266 4451	070 8266 4458	Evan.A.Reeves@dodgeit.com	2006-07-20	2006-06-20	2001-02-25	75 Moulton Road	…	7	1943	710	http://aliexpress.com	4111113022178850	5	68	52	ZAR	38

	First Name	Last Name	Sex	Phone	Fax	Email Address	Booking Date	Departure Date	Arrival Date	Address1	…	Currency Numeric Code	Company website	Flight ID Number	Flight Booking ID	Insurance	AFP	Flight Name	Age	year	month
0	Ellis	Harris	NaN	077 5665 3370	077 5665 3373	Ellis.E.Harris@mailinator.com	2012-12-18	2011-07-16	2000-08-23	63 Hull Road	…	710	http://qvc.com	4111113236285560	1	16	62	ZAR	62	2012	12
1	Aaron	Field	NaN	079 6068 9016	079 6068 9015	Aaron.A.Field@mailinator.com	2014-10-28	2005-02-26	2000-07-30	42 Russell Rd	…	710	http://msnbc.com	4111110402728960	2	83	38	ZAR	44	2014	10
2	Archie	Kaur	NaN	070 3056 9804	070 3056 9809	Archie.M.Kaur@spambob.com	2003-11-17	2000-12-05	2007-03-07	10 Southend Avenue	…	710	http://ap.org	4111110879255550	3	54	58	ZAR	60	2003	11
3	Bradley	Middleton	NaN	070 0765 7814	070 0765 7816	Bradley.A.Middleton@dodgeit.com	2013-07-18	2011-02-26	2015-10-25	32 Pier Road	…	710	http://tripadvisor.com	4111111855573320	4	72	43	ZAR	66	2013	7
4	Evan	Reeves	NaN	070 8266 4451	070 8266 4458	Evan.A.Reeves@dodgeit.com	2006-07-20	2006-06-20	2001-02-25	75 Moulton Road	…	710	http://aliexpress.com	4111113022178850	5	68	52	ZAR	38	2006	7

	First Name	Last Name	Sex	Phone	Fax	Email Address	Booking Date	Departure Date	Arrival Date	Address1	…	Currency Numeric Code	Company website	Flight ID Number	Flight Booking ID	Insurance	AFP	Flight Name	Age	year	month
0	Ellis	Harris	NaN	077 5665 3370	077 5665 3373	Ellis.E.Harris@mailinator.com	2012-12-18	2011-07-16	2000-08-23	63 Hull Road	…	710	http://qvc.com	4111113236285560	1	16	62	ZAR	62	2012	12
2	Archie	Kaur	NaN	070 3056 9804	070 3056 9809	Archie.M.Kaur@spambob.com	2003-11-17	2000-12-05	2007-03-07	10 Southend Avenue	…	710	http://ap.org	4111110879255550	3	54	58	ZAR	60	2003	11
3	Bradley	Middleton	NaN	070 0765 7814	070 0765 7816	Bradley.A.Middleton@dodgeit.com	2013-07-18	2011-02-26	2015-10-25	32 Pier Road	…	710	http://tripadvisor.com	4111111855573320	4	72	43	ZAR	66	2013	7
4	Evan	Reeves	NaN	070 8266 4451	070 8266 4458	Evan.A.Reeves@dodgeit.com	2006-07-20	2006-06-20	2001-02-25	75 Moulton Road	…	710	http://aliexpress.com	4111113022178850	5	68	52	ZAR	38	2006	7
6	Harley	Carroll	NaN	077 5239 3650	077 5239 3659	Harley.F.Carroll@dodgeit.com	2008-12-18	2007-09-28	2004-10-05	71 Stone St	…	710	http://fixya.com	4111111662317450	7	83	79	ZAR	60	2008	12
7	Christopher	Gibson	NaN	077 3549 4164	077 3549 4169	Christopher.M.Gibson@mailinator.com	2008-01-28	2011-05-04	2006-01-19	78 Bootham Terrace	…	710	http://vertitechnologygroup.com	4111112798242170	8	75	57	ZAR	60	2008	1
8	Isobel	Holden	NaN	079 2946 6947	079 2946 6946	Isobel.J.Holden@mailinator.com	2001-12-15	2009-05-25	2007-04-20	71 Front St	…	710	http://rr.com	4111114862347230	9	91	99	ZAR	27	2001	12
9	Alfie	Bolton	NaN	079 4864 1114	079 4864 1112	Alfie.K.Bolton@mailinator.com	2006-05-05	2002-11-29	2012-08-06	57 Ockham Road	…	710	http://nfl.com	4111111081803270	10	83	31	ZAR	30	2006	5

	AFP	Age	Flight Booking ID	Insurance	Number of Passengers	Postcode	Total Cost	month
year
2001	99.0	27.0	9.0	91.0	6.0	0.0	978.0	12.0
2003	58.0	60.0	3.0	54.0	8.0	0.0	1710.0	11.0
2006	41.5	34.0	7.5	75.5	7.0	0.0	2346.5	6.0
2008	68.0	60.0	7.5	79.0	8.5	0.0	2751.5	6.5
2012	62.0	62.0	1.0	16.0	2.0	0.0	983.0	12.0
2013	43.0	66.0	4.0	72.0	6.0	0.0	518.0	7.0
2014	38.0	44.0	2.0	83.0	9.0	0.0	2452.0	10.0
2015	14.0	46.0	6.0	55.0	6.0	0.0	1271.0	7.0

	AFP	Age	Flight Booking ID	Insurance	Number of Passengers	Postcode	Total Cost	month	year
Last Name
Bolton	31.0	30.0	10.0	83.0	7.0	0.0	2750.0	5.0	2006.0
Carroll	79.0	60.0	7.0	83.0	10.0	0.0	3869.0	12.0	2008.0
Field	38.0	44.0	2.0	83.0	9.0	0.0	2452.0	10.0	2014.0
Gibson	57.0	60.0	8.0	75.0	7.0	0.0	1634.0	1.0	2008.0
Harris	62.0	62.0	1.0	16.0	2.0	0.0	983.0	12.0	2012.0
Holden	99.0	27.0	9.0	91.0	6.0	0.0	978.0	12.0	2001.0
Kaur	36.0	53.0	4.5	54.5	7.0	0.0	1490.5	9.0	2009.0
Middleton	43.0	66.0	4.0	72.0	6.0	0.0	518.0	7.0	2013.0
Reeves	52.0	38.0	5.0	68.0	7.0	0.0	1943.0	7.0	2006.0

Python Pandas Pivot Table Index location Percentage calculation on Two columns – XlsxWriter pt2

note.¶

Recent search terms:

Array Transposition – Numpy Python Data Analysis

How To Track The Effectiveness of Email Campaign – Some Points to Consider

Basic Plotting Using Bokeh Python Pandas Library – Scatter, Line Visualizations

Test a Logistic Regression Model – Data Analysis and Intrepretation

R Shiny App for Mining Fields Evaluation

Leave a Reply Cancel reply

DataPandas LTS

EXPLORE DataPandas

ImportAnt link

GET IN TOUCH

© 2023 DataPandas

note.¶

Recent search terms:

People Who Read The Above Post Also Read This:

Similar Posts

Leave a Reply Cancel reply

DataPandas LTS

EXPLORE DataPandas

ImportAnt link

GET IN TOUCH

© 2023 DataPandas

Review Cart