import pandas as pd
import numpy as np
import os
from google.cloud import bigquery
# Path to the service-account key file downloaded from the Google Cloud console.
JSON_PATH = "path to JSON FILE from Google"
# google-cloud-bigquery reads credentials from this environment variable.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]=JSON_PATH
# Client authenticates using the credentials set above.
client = bigquery.Client()

High Level Idea

Pulling very specific GA4 data takes a bit of SQL knowledge and some intricate queries.

Writing these yourself is a headache, but with some Python knowledge, it's actually fairly easy.

Getting GA4 Running

Guide to set up the GA4 API https://developers.google.com/analytics/devguides/reporting/core/v4/quickstart/service-py

Explore a free GA4 dataset https://developers.google.com/analytics/bigquery/web-ecommerce-demo-dataset

# Pull a small sample of the raw events table, keeping only a few columns
# so the nested structure is easy to inspect.
sample_query = """
SELECT event_timestamp, event_name, event_params                                                                
FROM  `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
WHERE _TABLE_SUFFIX BETWEEN '20201101' AND '20210131'
LIMIT 15
;
"""
sample_job = client.query(sample_query)
sample_df = sample_job.to_dataframe()
# Look at a single row
sample_df.head(1)
event_timestamp event_name event_params
0 1605594151663973 page_view [{'key': 'page_referrer', 'value': {'string_va...

This data is full of nested fields, which is an issue if we want to do data science or analysis.

What if we want a specific value that is currently nested inside “event_params” as its own column?

sample_df.iloc[0]['event_params'][:5]
array([{'key': 'ga_session_number', 'value': {'string_value': None, 'int_value': 2, 'float_value': None, 'double_value': None}},
       {'key': 'ga_session_id', 'value': {'string_value': None, 'int_value': 4389558959, 'float_value': None, 'double_value': None}},
       {'key': 'clean_event', 'value': {'string_value': 'gtm.js', 'int_value': None, 'float_value': None, 'double_value': None}},
       {'key': 'page_location', 'value': {'string_value': 'https://shop.googlemerchandisestore.com/Google+Redesign/Apparel/Google+Dino+Game+Tee', 'int_value': None, 'float_value': None, 'double_value': None}},
       {'key': 'engagement_time_msec', 'value': {'string_value': None, 'int_value': 40560, 'float_value': None, 'double_value': None}}],
      dtype=object)

Cross Join

This trick is documented by Google, as it's a common issue.

Theoretically, this is how it works: we join the main table against every row of its nested table. This means that for this one observation there would now be five rows, one for each key. Then we simply say that we only want records with the key equal to ga_session_id.

# The cross-join trick in SQL: join each event row against its own nested
# event_params array (implicit UNNEST), then keep only the rows whose key
# matches the one we want.
session_query = '''
SELECT
  event_timestamp,event_name, value.int_value as ga_session_id

FROM
    `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*` AS T
    CROSS JOIN
      T.event_params

WHERE
  event_params.key = 'ga_session_id'
  AND _TABLE_SUFFIX BETWEEN '20201101' AND '20210131'
  LIMIT 500
  ;
'''
# Run it and inspect the flattened result
session_job = client.query(session_query)
df_sample = session_job.to_dataframe()
df_sample.head(10)
event_timestamp event_name ga_session_id
0 1609079909474813 first_visit 6267689519
1 1609079913730045 user_engagement 6267689519
2 1609079909474813 session_start 6267689519
3 1609079909474813 page_view 6267689519
4 1609045476565920 first_visit 1433811680
5 1609045476565920 page_view 1433811680
6 1609046700919403 scroll 1433811680
7 1609045476565920 session_start 1433811680
8 1609031351701297 first_visit 6998346161
9 1609031351701297 page_view 6998346161

Look how nice this looks! We have pulled out a nested subfield as a column. Naturally, the next question becomes: what about pulling out multiple of these nested columns?

Theoretically, we just want to keep making new tables and joining them to our previous tables, resulting in multiple exploded columns all in one row.

Let's turn this into two functions: one for generating a single query that explodes one column out, and another for joining that to another table!

def single_querey(start_date, end_date, val_type, field, sub_field,
                  table='bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*'):
    """Build a query that explodes one nested key out of a repeated field.

    The generated SQL cross-joins each event row against its own nested
    ``field`` array (BigQuery's implicit UNNEST of a correlated array) and
    keeps only the rows whose ``key`` equals ``sub_field``, exposing the
    chosen typed value slot as a column named after ``sub_field``.

    Parameters
    ----------
    start_date, end_date : str
        Inclusive ``_TABLE_SUFFIX`` bounds in ``YYYYMMDD`` form.
    val_type : str
        Which typed slot of the value struct to read: 'int', 'string',
        'float' or 'double'.
    field : str
        Name of the repeated RECORD column to unnest, e.g. 'event_params'.
    sub_field : str
        Key to filter on; also becomes the output column name.
    table : str, optional
        Fully-qualified (wildcard) table to query. Defaults to the public
        GA4 sample dataset, so existing callers are unchanged.

    Returns
    -------
    str
        A complete SQL statement terminated with ';'.
    """
    # Quotes are written directly in the template instead of being
    # concatenated onto the date arguments, which mutated the parameters.
    querey = """
    SELECT
      event_timestamp,event_name, value.{val_type}_value as {sub_field}
    FROM

        `{table}` AS T
        CROSS JOIN
          T.{field}
    WHERE
      {field}.key = '{sub_field}'
      AND _TABLE_SUFFIX BETWEEN '{start_date}' AND '{end_date}';
    """.format(start_date=start_date, end_date=end_date, val_type=val_type,
               field=field, sub_field=sub_field, table=table)
    return querey

def join_queries(q1, q2, joined_col='T'):
    """Left-join the result of ``q2`` onto ``q1`` on (event_timestamp, event_name).

    Both queries are wrapped as subselects, so their trailing semicolons must
    be removed first. Only the *trailing* semicolon is stripped — the previous
    blanket ``str.replace(';', '')`` also deleted semicolons inside string
    literals (e.g. URLs in page_location values), corrupting the SQL.

    Parameters
    ----------
    q1, q2 : str
        Complete SELECT statements, optionally ';'-terminated.
    joined_col : str, optional
        Alias for the ``q2`` subquery AND the name of the single column
        pulled from it. NOTE: the default 'T' collides with the ``q1``
        alias — callers should always pass an explicit column name.

    Returns
    -------
    str
        A combined ``SELECT ... LEFT JOIN ...`` statement ending in ';'.
    """
    # Strip trailing whitespace, then only a trailing terminator.
    q1 = q1.rstrip().rstrip(';')
    q2 = q2.rstrip().rstrip(';')
    q_new = """
    SELECT T.*,{2}.{2}
     FROM 
     ( {0} ) AS T 
     LEFT JOIN 
     ( {1} ) AS {2} 
     ON T.event_timestamp = {2}.event_timestamp
     AND
     T.event_name = {2}.event_name;
    """.format(q1, q2, joined_col)
    return q_new
# Use the two helpers to explode three nested keys — ga_session_id,
# page_title and page_location — and chain the results with left joins.
start_date, end_date = '20201101', '20201202'
specs = [('int', 'ga_session_id'), ('string', 'page_title'), ('string', 'page_location')]
q_session, q_title, q_location = [
    single_querey(start_date, end_date, vtype, 'event_params', key)
    for vtype, key in specs
]
q_new = join_queries(q_session, q_title, joined_col='page_title')
q_new = join_queries(q_new, q_location, joined_col='page_location')
%%time
# Execute the composed three-way join and pull the result into a DataFrame.
df = client.query(q_new).to_dataframe()
df.head()
CPU times: total: 2.42 s
Wall time: 18.7 s
event_timestamp event_name ga_session_id page_title page_location
0 1605498662409146 view_item 2200271228 Google Mural Socks https://shop.googlemerchandisestore.com/Google...
1 1605497966809821 user_engagement 2200271228 #IamRemarkable | Shop by Brand | Google Mercha... https://shop.googlemerchandisestore.com/Google...
2 1605498605857861 user_engagement 2200271228 Google Men's Puff Jacket Black https://shop.googlemerchandisestore.com/Google...
3 1605489713558309 session_start 970858984 Water Bottles | Drinkware | Google Merchandise... https://shop.googlemerchandisestore.com/Google...
4 1605535584700554 user_engagement 7103403152 Google Sherpa Zip Hoodie Charcoal https://shop.googlemerchandisestore.com/Google...

Here we are able to run a very complex query — one that would be a serious headache to write by hand — automatically using Python, and pull a full month of data in less than half a minute.

# Here's what that composed query looks like
print(q_new)
    SELECT T.*,page_location.page_location
     FROM 
     ( 
    SELECT T.*,page_title.page_title
     FROM 
     ( 
    SELECT
      event_timestamp,event_name, value.int_value as ga_session_id
    FROM

        `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*` AS T
        CROSS JOIN
          T.event_params
    WHERE
      event_params.key = 'ga_session_id'
      AND _TABLE_SUFFIX BETWEEN '20201101' AND '20201202'
     ) AS T 
     LEFT JOIN 
     ( 
    SELECT
      event_timestamp,event_name, value.string_value as page_title
    FROM

        `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*` AS T
        CROSS JOIN
          T.event_params
    WHERE
      event_params.key = 'page_title'
      AND _TABLE_SUFFIX BETWEEN '20201101' AND '20201202'
     ) AS page_title 
     ON T.event_timestamp = page_title.event_timestamp
     AND
     T.event_name = page_title.event_name
     ) AS T 
     LEFT JOIN 
     ( 
    SELECT
      event_timestamp,event_name, value.string_value as page_location
    FROM

        `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*` AS T
        CROSS JOIN
          T.event_params
    WHERE
      event_params.key = 'page_location'
      AND _TABLE_SUFFIX BETWEEN '20201101' AND '20201202'
     ) AS page_location 
     ON T.event_timestamp = page_location.event_timestamp
     AND
     T.event_name = page_location.event_name;