Python for BigQuery and SQL
import pandas as pd
import numpy as np
import os
from google.cloud import bigquery
# Path to the service-account key file downloaded from the Google Cloud console.
JSON_PATH = "path to JSON FILE from Google"
# The google-cloud client libraries read credentials from this environment variable.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]=JSON_PATH
# Client picks up project and credentials from the environment set above.
client = bigquery.Client()
High Level Idea
Pulling very specific GA4 data takes a bit of SQL knowledge, and intricate queries.
Writing these yourself is a headache, but with some Python knowledge it's actually fairly easy.
Getting GA4 Running
Guide to set up the GA4 API https://developers.google.com/analytics/devguides/reporting/core/v4/quickstart/service-py
Explore a free GA4 dataset https://developers.google.com/analytics/bigquery/web-ecommerce-demo-dataset
# Grab a small sample of the data — just a few columns, for simplicity's sake.
sample_query = """
SELECT event_timestamp, event_name, event_params
FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
WHERE _TABLE_SUFFIX BETWEEN '20201101' AND '20210131'
LIMIT 15
;
"""
# Submit the job, then materialize the result as a DataFrame.
sample_job = client.query(sample_query)
sample_df = sample_job.to_dataframe()
# Inspect a single row.
sample_df.head(1)
| event_timestamp | event_name | event_params | |
|---|---|---|---|
| 0 | 1605594151663973 | page_view | [{'key': 'page_referrer', 'value': {'string_va... |
This data is full of nested fields, which is a problem if we want to do data science or analysis.
What if we want a specific value that is currently nested inside event_params as its own column?
# Peek at the first five nested key/value structs inside one event's event_params.
sample_df.iloc[0]['event_params'][:5]
array([{'key': 'ga_session_number', 'value': {'string_value': None, 'int_value': 2, 'float_value': None, 'double_value': None}},
{'key': 'ga_session_id', 'value': {'string_value': None, 'int_value': 4389558959, 'float_value': None, 'double_value': None}},
{'key': 'clean_event', 'value': {'string_value': 'gtm.js', 'int_value': None, 'float_value': None, 'double_value': None}},
{'key': 'page_location', 'value': {'string_value': 'https://shop.googlemerchandisestore.com/Google+Redesign/Apparel/Google+Dino+Game+Tee', 'int_value': None, 'float_value': None, 'double_value': None}},
{'key': 'engagement_time_msec', 'value': {'string_value': None, 'int_value': 40560, 'float_value': None, 'double_value': None}}],
dtype=object)
Cross Join
This trick is recommended by Google, as it's a common issue.
Conceptually this is how it works: we join the main table to every row of the nested table. That means this one observation would now produce 5 rows, one per key. Then we simply filter to the records whose key equals ga_session_number.
# Here is what that cross join looks like in practice:
querey = '''
SELECT
event_timestamp,event_name, value.int_value as ga_session_id
FROM
`bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*` AS T
CROSS JOIN
T.event_params
WHERE
event_params.key = 'ga_session_id'
AND _TABLE_SUFFIX BETWEEN '20201101' AND '20210131'
LIMIT 500
;
'''
# Run it and pull the rows back for inspection.
query_job = client.query(querey)
df_sample = query_job.to_dataframe()
df_sample.head(10)
| event_timestamp | event_name | ga_session_id | |
|---|---|---|---|
| 0 | 1609079909474813 | first_visit | 6267689519 |
| 1 | 1609079913730045 | user_engagement | 6267689519 |
| 2 | 1609079909474813 | session_start | 6267689519 |
| 3 | 1609079909474813 | page_view | 6267689519 |
| 4 | 1609045476565920 | first_visit | 1433811680 |
| 5 | 1609045476565920 | page_view | 1433811680 |
| 6 | 1609046700919403 | scroll | 1433811680 |
| 7 | 1609045476565920 | session_start | 1433811680 |
| 8 | 1609031351701297 | first_visit | 6998346161 |
| 9 | 1609031351701297 | page_view | 6998346161 |
Look how nice this looks! We have pulled out a nested sub-field as its own column. Naturally, the next question becomes: what about pulling out multiple of these nested columns?
Theoretically we just want to keep making new tables and join them to our previous tables resulting in multiple exploded columns all in one row
Let's turn this into two functions: one for generating a single query that explodes one column out, and another for joining that to another table!
def single_querey(start_date, end_date, val_type, field, sub_field):
    """Build a BigQuery SQL statement that flattens one nested GA4 parameter.

    The generated query CROSS JOINs the events table with its repeated
    ``field`` column, keeps only rows whose ``key`` equals ``sub_field``,
    and exposes the typed value as a column named ``sub_field``.

    Parameters
    ----------
    start_date, end_date : str
        Table-suffix date bounds in YYYYMMDD form, e.g. '20201101'.
    val_type : str
        Typed slot of the value struct to read: 'int', 'string',
        'float' or 'double'.
    field : str
        Repeated RECORD column to explode, e.g. 'event_params'.
    sub_field : str
        Parameter key to filter on; also becomes the output column alias.

    Returns
    -------
    str
        A complete, semicolon-terminated query string.
    """
    # NOTE(review): arguments are interpolated straight into the SQL text,
    # so only call this with trusted, hard-coded values (no user input).
    # The quotes live in the template itself, so there is no need to
    # hand-wrap the dates in quote characters (the old code mutated its
    # own parameters to do that).
    querey = f"""
    SELECT
        event_timestamp,event_name, value.{val_type}_value as {sub_field}
    FROM
        `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*` AS T
    CROSS JOIN
        T.{field}
    WHERE
        {field}.key = '{sub_field}'
        AND _TABLE_SUFFIX BETWEEN '{start_date}' AND '{end_date}';
    """
    return querey
def join_queries(q1, q2, joined_col='T'):
    """Combine two exploded-column queries into one via LEFT JOIN.

    ``q1`` becomes the left subquery (aliased ``T``) and ``q2`` the right
    (aliased ``joined_col``); they are joined on event_timestamp and
    event_name, so each output row carries both exploded columns.

    Parameters
    ----------
    q1, q2 : str
        Query strings as produced by ``single_querey`` (or a previous
        ``join_queries`` call), optionally semicolon-terminated.
    joined_col : str
        Alias for the right subquery; also the column pulled from it as
        ``joined_col.joined_col``. NOTE(review): the default 'T' collides
        with the left alias — in practice always pass the exploded
        column's name.

    Returns
    -------
    str
        A single semicolon-terminated query selecting ``T.*`` plus the
        joined column.
    """
    # Strip only the *trailing* semicolon so each query can be nested as a
    # subquery. The previous replace(';', '') removed EVERY semicolon and
    # would have mangled one appearing inside a SQL string literal.
    q1 = q1.rstrip().rstrip(';')
    q2 = q2.rstrip().rstrip(';')
    q_new = """
    SELECT T.*,{2}.{2}
    FROM
    ( {0} ) AS T
    LEFT JOIN
    ( {1} ) AS {2}
    ON T.event_timestamp = {2}.event_timestamp
    AND
    T.event_name = {2}.event_name;
    """.format(q1, q2, joined_col)
    return q_new
# Now let's use these functions to grab 3 exploded-out columns:
# ga_session_id, page_title and page_location.
start_date = '20201101'
end_date = '20201202'
# One single-column query per nested parameter; note the value type each one reads
# (ga_session_id is stored as an int, the page fields as strings).
q1 = single_querey(start_date,end_date,'int','event_params','ga_session_id')
q2 = single_querey(start_date,end_date,'string','event_params','page_title')
q3 = single_querey(start_date,end_date,'string','event_params','page_location')
# Chain the joins: first attach page_title, then page_location on top of that result.
q_new = join_queries(q1,q2,joined_col = 'page_title')
q_new = join_queries(q_new,q3,joined_col = 'page_location')
%%time
# Execute the composed query; to_dataframe() downloads the result into pandas.
df = client.query(q_new).to_dataframe()
df.head()
CPU times: total: 2.42 s
Wall time: 18.7 s
| event_timestamp | event_name | ga_session_id | page_title | page_location | |
|---|---|---|---|---|---|
| 0 | 1605498662409146 | view_item | 2200271228 | Google Mural Socks | https://shop.googlemerchandisestore.com/Google... |
| 1 | 1605497966809821 | user_engagement | 2200271228 | #IamRemarkable | Shop by Brand | Google Mercha... | https://shop.googlemerchandisestore.com/Google... |
| 2 | 1605498605857861 | user_engagement | 2200271228 | Google Men's Puff Jacket Black | https://shop.googlemerchandisestore.com/Google... |
| 3 | 1605489713558309 | session_start | 970858984 | Water Bottles | Drinkware | Google Merchandise... | https://shop.googlemerchandisestore.com/Google... |
| 4 | 1605535584700554 | user_engagement | 7103403152 | Google Sherpa Zip Hoodie Charcoal | https://shop.googlemerchandisestore.com/Google... |
Here we are able to run a very complex query that would be a serious headache to write by hand, generate it automatically with Python, and pull a full month of data in less than half a minute.
# Here's what that generated query looks like, fully expanded.
print(q_new)
SELECT T.*,page_location.page_location
FROM
(
SELECT T.*,page_title.page_title
FROM
(
SELECT
event_timestamp,event_name, value.int_value as ga_session_id
FROM
`bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*` AS T
CROSS JOIN
T.event_params
WHERE
event_params.key = 'ga_session_id'
AND _TABLE_SUFFIX BETWEEN '20201101' AND '20201202'
) AS T
LEFT JOIN
(
SELECT
event_timestamp,event_name, value.string_value as page_title
FROM
`bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*` AS T
CROSS JOIN
T.event_params
WHERE
event_params.key = 'page_title'
AND _TABLE_SUFFIX BETWEEN '20201101' AND '20201202'
) AS page_title
ON T.event_timestamp = page_title.event_timestamp
AND
T.event_name = page_title.event_name
) AS T
LEFT JOIN
(
SELECT
event_timestamp,event_name, value.string_value as page_location
FROM
`bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*` AS T
CROSS JOIN
T.event_params
WHERE
event_params.key = 'page_location'
AND _TABLE_SUFFIX BETWEEN '20201101' AND '20201202'
) AS page_location
ON T.event_timestamp = page_location.event_timestamp
AND
T.event_name = page_location.event_name;