Natural language processing with DAAO#

This is an exploratory analysis of data collected from DAAO (Design & Art Australia Online). We focus on venues and organisations, applying natural language processing to reveal patterns in the data.

The visualisations consist of…

  • word clouds

  • dendrograms

  • time series and temporal bar charts

Import packages and pre-process data#

We have provided the code used to generate the data used in this notebook. Expand the collapsed code cells for more details on how the data is managed.

Hide code cell source
# for data mgmt
import numpy as np
from collections import Counter
import requests, gzip, io, os, json, pandas as pd
import ast

# for plotting
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

from itables import show

import warnings
warnings.filterwarnings("ignore")

# provide folder_name which contains uncompressed data i.e., csv and jsonl files
# only need to change this if you have already downloaded the data
# otherwise data will be fetched from google drive
global folder_name
folder_name = 'data/local'

def fetch_small_data_from_github(fname):
    url = f"https://raw.githubusercontent.com/acd-engine/jupyterbook/master/data/analysis/{fname}"
    response = requests.get(url)
    rawdata = response.content.decode('utf-8')
    return pd.read_csv(io.StringIO(rawdata))

def fetch_date_suffix():
    url = f"https://raw.githubusercontent.com/acd-engine/jupyterbook/master/data/analysis/date_suffix"
    response = requests.get(url)
    rawdata = response.content.decode('utf-8')
    try: return rawdata[:12]
    except: return None

def check_if_csv_exists_in_folder(filename):
    try: return pd.read_csv(os.path.join(folder_name, filename), low_memory=False)
    except: return None

def fetch_data(filetype='csv', acdedata='organization'):
    filename = f'acde_{acdedata}_{fetch_date_suffix()}.{filetype}'

    # first check if the data exists in current directory
    data_from_path = check_if_csv_exists_in_folder(filename)
    if data_from_path is not None: return data_from_path

    urls = fetch_small_data_from_github('acde_data_gdrive_urls.csv')
    sharelink = urls[urls.data == acdedata][filetype].values[0]
    url = f'https://drive.google.com/u/0/uc?id={sharelink}&export=download&confirm=yes'

    response = requests.get(url)
    decompressed_data = gzip.decompress(response.content)
    decompressed_buffer = io.StringIO(decompressed_data.decode('utf-8'))

    try:
        if filetype == 'csv': df = pd.read_csv(decompressed_buffer, low_memory=False)
        else: df = [json.loads(jl) for jl in pd.read_json(decompressed_buffer, lines=True, orient='records')[0]]
        return pd.DataFrame(df)
    except: return None 

def expand_DAAO_events(fname = 'DAAO_event_expanded_data.csv', save_locally=False):
    # first check if the data exists in current directory
    data_from_path = check_if_csv_exists_in_folder(fname)
    if data_from_path is not None: return data_from_path

    dfs = fetch_data(acdedata='event') # 40s

    dfs_expanded = []

    # get start and end years - 20 secs
    for idx,row in dfs[dfs.data_source.str.contains('DAAO')].iterrows():

        try: 
            this_locations = pd.json_normalize(ast.literal_eval(row['coverage_ranges']))

            for idx2, row2 in this_locations.iterrows():
                try: start_yr = row2['date_range.date_start.year']
                except: start_yr = None

                try: end_yr = row2['date_range.date_end.year']
                except: end_yr = None

                try: place_address = row2['place.ori_address']
                except: place_address = None

                try: 
                    latitude = row2['place.geo_coord.latitude']
                    longitude = row2['place.geo_coord.longitude']
                except:
                    latitude = None; longitude = None

                row['start_year'] = int(start_yr); row['end_year'] = int(end_yr)
                row['latitude'] = latitude; row['longitude'] = longitude
                row['place_address'] = place_address
                dfs_expanded.append(row)

        except:
            start_yr = None; end_yr = None; latitude = None; longitude = None; place_address = None
            dfs_expanded.append(row)

    # optionally save the expanded data locally, then return it as a dataframe
    if save_locally: pd.DataFrame(dfs_expanded).to_csv(f'{folder_name}/{fname}', index=False)
    return pd.DataFrame(dfs_expanded)

# read in expanded data
dfs_expanded = expand_DAAO_events(fname = 'DAAO_event_expanded_data.csv', save_locally=False)
dfs_expanded = dfs_expanded.drop_duplicates()

Summary statistics#

First, we review the collected data at a high level, in terms of richness across fields.

Hide code cell source
locations_cond = dfs_expanded.latitude.notnull()
year_cond = dfs_expanded.start_year.notnull()
desc_cond = dfs_expanded.description.notnull()
period_cond = (dfs_expanded.start_year >= 1900) & (dfs_expanded.start_year <= 2020)

print('In DAAO,') 
print(f'- there are {dfs_expanded.shape[0]} events.')
print(f'- there are {dfs_expanded[year_cond].shape[0]} events with dates.')
print(f'- there are {dfs_expanded[locations_cond & year_cond].shape[0]} events with geocodes and dates.')
print(f'- there are {dfs_expanded[locations_cond & year_cond & desc_cond].shape[0]} events with geocodes, dates and biographical information.')
print(f'- there are {dfs_expanded[locations_cond & year_cond & desc_cond & period_cond].shape[0]} events with geocodes, dates and biographical information after filtering for events between 1900 and 2020.')
In DAAO,
- there are 21838 events.
- there are 20544 events with dates.
- there are 17214 events with geocodes and dates.
- there are 7418 events with geocodes, dates and biographical information.
- there are 7248 events with geocodes, dates and biographical information after filtering for events between 1900 and 2020.
Hide code cell source
dfs_rich = dfs_expanded[locations_cond & year_cond & desc_cond & period_cond].copy()

type_count = dfs_rich['types'].value_counts().reset_index().rename(columns={'index':'type','types':'count'})
type_count['prop'] = round((type_count['count']/dfs_rich.shape[0])*100,2)
exh_prop = type_count['type'].str.contains('exhibition')
other_prop = type_count['type'].str.contains('other event')
fest_prop = type_count['type'].str.contains('festival')
perf_prop = type_count['type'].str.contains('performance')

# create a decade column
dfs_rich['decade_start'] = [str(int(x))[:3]+'0' for x in dfs_rich['start_year']]
dfs_rich['decade_start'] = dfs_rich['decade_start'].astype(int)

print('\nWe use the 7,248 events for the remainder of the analysis. In this subset,') 
print(f'- there are {type_count[exh_prop]["count"].sum()} exhibitions ({round(type_count[exh_prop]["prop"].sum(),2)}%).')
print(f'- there are {type_count[other_prop]["count"].sum()} other-events ({round(type_count[other_prop]["prop"].sum(),2)}%).')
print(f'- there are {type_count[fest_prop]["count"].sum()} festivals ({round(type_count[fest_prop]["prop"].sum(),2)}%).')
print(f'- there are {type_count[perf_prop]["count"].sum()} performance events ({round(type_count[perf_prop]["prop"].sum(),2)}%).')
print(f'- there are {dfs_rich["types"].isnull().sum()} events with missing type data ({round((dfs_rich["types"].isnull().sum()/dfs_rich.shape[0])*100,2)}%).')

print('\nIn terms of time,')
for t in range(1900,2021,10):
    if t != 2020: print(f'- there are {dfs_rich[dfs_rich.decade_start == t].shape[0]} events between {t}-{t+10} ({round((dfs_rich[dfs_rich.decade_start == t].shape[0]/dfs_rich.shape[0])*100,2)}%).')
    else: print(f'- there are {dfs_rich[dfs_rich.start_year == t].shape[0]} events in {t} ({round((dfs_rich[dfs_rich.start_year == t].shape[0]/dfs_rich.shape[0])*100,2)}%).')
We use the 7,248 events for the remainder of the analysis. In this subset,
- there are 6629 exhibitions (91.46%).
- there are 85 other-events (1.17%).
- there are 31 festivals (0.43%).
- there are 20 performance events (0.28%).
- there are 488 events with missing type data (6.73%).

In terms of time,
- there are 33 events between 1900-1910 (0.46%).
- there are 31 events between 1910-1920 (0.43%).
- there are 39 events between 1920-1930 (0.54%).
- there are 40 events between 1930-1940 (0.55%).
- there are 41 events between 1940-1950 (0.57%).
- there are 90 events between 1950-1960 (1.24%).
- there are 159 events between 1960-1970 (2.19%).
- there are 629 events between 1970-1980 (8.68%).
- there are 1333 events between 1980-1990 (18.39%).
- there are 2141 events between 1990-2000 (29.54%).
- there are 2216 events between 2000-2010 (30.57%).
- there are 493 events between 2010-2020 (6.8%).
- there are 3 events in 2020 (0.04%).

Most frequent geocodes#

Before jumping into the visuals, we inspect the nuances of the geocodes attached to exhibition data. We find that the geocodes are not always accurate, and that many distinct venues share the same geocode. Further pre-processing will need to be conducted to ensure downstream trends can be accurately identified.

Below is a list of the top 100 geocodes and the number of events they represent. Through further inspection, we can see which geocodes accurately represent the place names.

Hide code cell source
most_freq_geocodes = dfs_rich[['latitude','longitude']].value_counts().reset_index().rename(columns={0:'Frequency'})
most_freq_geocodes = most_freq_geocodes.head(100)

# get the most frequent place_address for each geocode
for idx,row in most_freq_geocodes.iterrows():
    place_address = dfs_rich[(dfs_rich.latitude == row['latitude']) & (dfs_rich.longitude == row['longitude'])]['place_address'].value_counts().index[0]

    try: place_address2 = dfs_rich[(dfs_rich.latitude == row['latitude']) & (dfs_rich.longitude == row['longitude'])]['place_address'].value_counts().index[1]
    except: place_address2 = None
    
    try: place_address3 = dfs_rich[(dfs_rich.latitude == row['latitude']) & (dfs_rich.longitude == row['longitude'])]['place_address'].value_counts().index[2]
    except: place_address3 = None

    try: place_address4 = dfs_rich[(dfs_rich.latitude == row['latitude']) & (dfs_rich.longitude == row['longitude'])]['place_address'].value_counts().index[3]
    except: place_address4 = None

    try: place_address5 = dfs_rich[(dfs_rich.latitude == row['latitude']) & (dfs_rich.longitude == row['longitude'])]['place_address'].value_counts().index[4]
    except: place_address5 = None
    
    most_freq_geocodes.loc[idx,'most_frequent_address'] = place_address
    most_freq_geocodes.loc[idx,'2ndmost_frequent_address'] = place_address2
    most_freq_geocodes.loc[idx,'3rdmost_frequent_address'] = place_address3
    most_freq_geocodes.loc[idx,'4thmost_frequent_address'] = place_address4
    most_freq_geocodes.loc[idx,'5thmost_frequent_address'] = place_address5

# display data
show(most_freq_geocodes, scrollY="400px", scrollCollapse=True, scrollX=True,
     paging=False, showIndex=False, column_filters="footer", dom="tpr")
[Interactive table: latitude, longitude, frequency, and the five most frequent addresses for each of the top 100 geocodes]

Top 10 geocoded places with most events#

After omitting geocodes with random or erroneous place names, we can see the “true” top 10 geocoded places with the most events. As expected, these are all art galleries, all with at least 90 events. A short sketch after the table shows how geocodes shared by multiple distinct place names can be flagged.

Venue                                                 City               Frequency
Art Gallery of New South Wales                        Sydney, NSW              292
Wollongong City Gallery                               Wollongong, NSW          183
Museums and Art Galleries of the Northern Territory   Darwin, NT               128
National Gallery of Victoria                          Melbourne, VIC           124
Queensland Art Gallery                                Brisbane, QLD            113
Newcastle Region Art Gallery                          Newcastle, NSW           106
Orange Regional Gallery                               Orange, NSW              102
Warrnambool Art Gallery                               Warrnambool, VIC          99
8 Llankelly Place, Kings Cross                        Sydney, NSW               96
Art Gallery of Western Australia                      Perth, WA                 90
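The conflicts mentioned above, where one geocode is shared by several distinct place names, can be surfaced directly from the expanded data. Below is a minimal sketch, assuming dfs_rich as built above; the column name n_distinct_addresses is introduced here only for display.

# A minimal sketch: flag geocodes shared by several distinct place names,
# i.e. the conflicts that the pre-processing below resolves.
conflicting_geocodes = (
    dfs_rich.groupby(['latitude', 'longitude'])['place_address']
            .nunique()
            .reset_index(name='n_distinct_addresses')
)

print(conflicting_geocodes[conflicting_geocodes['n_distinct_addresses'] > 1]
      .sort_values('n_distinct_addresses', ascending=False)
      .head(10))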

Data pre-processing#

We split the cleaning process into four parts, each discussed in detail below.

1. Aggregate place names with no geocode/address conflicts#

First, we aggregate place names that share the same geocode; however, we only do this for geocodes with no conflicting places (i.e., no erroneous geocodes). We manually checked this for the top 100 geocodes.

Hide code cell source
qag_dict = dfs_rich[dfs_rich.place_address == 'Queensland Art Gallery, Brisbane, QLD'][['latitude','longitude']]\
            .head(1).T.iloc[:,0].to_list()

qvmag_dict = dfs_rich[dfs_rich.place_address == 'Queen Victoria Museum and Art Gallery, Launceston, TAS'][['latitude','longitude']]\
            .head(1).T.iloc[:,0].to_list()

fix_these_placenames = ['Art Gallery of New South Wales, Sydney, NSW',
 'National Gallery of Victoria, Melbourne, VIC',
 'Museums and Art Galleries of the Northern Territory, Darwin, NT',
 'Queensland Art Gallery, Brisbane, QLD', 
 'Queensland Museum, Brisbane, QLD',
 'Newcastle Region Art Gallery, Newcastle, NSW',
 'Orange Regional Gallery, Orange, NSW',
 'Warrnambool Art Gallery, Warrnambool, VIC',
 'Art Gallery of Western Australia, Perth, WA',
 'Queen Victoria Museum and Art Gallery, Launceston, TAS',
 'University Gallery, Launceston, Tas.',
 'Ballarat Fine Art Gallery, Ballarat, VIC',
 'Ian Potter Museum of Art, University of Melbourne, Melbourne, Vic.',
 'Heide Museum of Modern Art, Melbourne, VIC',
 'Art Gallery of South Australia, Adelaide, SA',
 'National Gallery of Australia, Canberra, ACT',
 'Tasmanian Museum and Art Gallery, Hobart, TAS',
 'Plimsoll Gallery, University of Tasmania, Hobart, TAS',
 'University of South Australia, Adelaide, SA',
 'Lawrence Wilson Art Gallery, University of Western Australia, Perth, WA',
 'University of Melbourne, Melbourne, Vic',
 'S.H. Ervin Gallery, Sydney, NSW',
 'Ace of Clubs Hall, Redcliffe, QLD',
 'NSW Parliament House, Sydney, NSW',
 'Campbelltown City Art Gallery, Campbelltown, NSW',
 'Ivan Dougherty Gallery, University of New South Wales, Sydney, NSW',
 'New England Regional Art Museum, Armidale, NSW',
 'Geelong Art Gallery, Geelong, VIC',
 'Bathurst Regional Art Gallery, Bathurst, NSW',
 'Bendigo Art Gallery, Bendigo, VIC',
 'Shanghai Art Gallery, Shanghai, China',
 'Glen Eira City Gallery, Melbourne, Vic.',
 'Penrith Regional Gallery, Penrith, NSW',
 'John Curtin Gallery, Curtin University of Technology, Perth, WA',
 'Tandanya National Aboriginal Cultural Institute, Adelaide, SA',
 'Mosman Art Gallery, Mosman, NSW',
 'Casula Powerhouse Arts Centre, Casula, NSW',
 'Tamworth Regional Gallery, Tamworth, NSW',
 'Royal Exhibition Building, Melbourne, VIC',
 'Gomboc Gallery, Perth, WA',
 'Albury Regional Art Centre, Albury, NSW',
 'Manly Art Gallery & Museum, Manly, Sydney, NSW',
 'Devonport Regional Gallery, Devonport, Tas.',
 'State Library of NSW, Sydney, NSW']

# iterate over fix_these_placenames and update place names of rows with matching geocodes
for place in fix_these_placenames:
    this_place = dfs_rich[dfs_rich.place_address == place].head(1)
    matching_geocodes = dfs_rich[(dfs_rich.latitude == this_place['latitude'].values[0]) &\
                                 (dfs_rich.longitude == this_place['longitude'].values[0])]
    
    # use index of matching_geocodes to replace place_address with place
    dfs_rich.loc[matching_geocodes.index,'place_address'] = place

dfs_rich.loc[dfs_rich.place_address == 'Queensland Museum, Brisbane, QLD',['latitude','longitude']] = qag_dict
dfs_rich.loc[dfs_rich.place_address == 'University Gallery, Launceston, Tas.',['latitude','longitude']] = qvmag_dict

# iterate over fix_these_placenames and update place names of rows with matching geocodes
for place in ['Queensland Art Gallery, Brisbane, QLD','Queen Victoria Museum and Art Gallery, Launceston, TAS']:
    this_place = dfs_rich[dfs_rich.place_address == place].head(1)
    matching_geocodes = dfs_rich[(dfs_rich.latitude == this_place['latitude'].values[0]) &\
                                 (dfs_rich.longitude == this_place['longitude'].values[0])]
    
    # use index of matching_geocodes to replace place_address with place
    dfs_rich.loc[matching_geocodes.index,'place_address'] = place

2. Cleaning by aggregating by geocode#

Next, we use the geopy library to convert the place names into geocodes. This helps us identify erroneous place names as well as duplicate venues. Only 33% of venues have a geocode attached to them; these are handled accordingly, keeping only venues with more than four exhibitions.
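The geocoding step itself is commented out in the cell below (it takes roughly 40 minutes to run) and a cached CSV is loaded instead. For reference, a minimal self-contained sketch of the geopy call pattern looks like the following; the user_agent string and the example address are placeholders.

# Minimal geopy sketch (illustrative only; user_agent is a placeholder).
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

locator = Nominatim(user_agent="daao_eda_example")
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)  # throttle requests to the API

location = geocode("Art Gallery of New South Wales, Sydney, NSW")
if location is not None:
    print(location.latitude, location.longitude)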

Hide code cell source
# import geopandas # !pip install geopandas
# import geopy # !pip install geopy
# from geopy.geocoders import Nominatim
# from geopy.extra.rate_limiter import RateLimiter

# venue_addresses = pd.DataFrame(dfs_rich['place_address'].unique()).rename(columns={0:'address_prompt'})

# # 40 min process
# locator = Nominatim(user_agent="myGeocoder")

# # 1 - convenient function to delay between geocoding calls
# geocode = RateLimiter(locator.geocode, min_delay_seconds=1)

# # 2 - create location column
# venue_addresses['location'] = venue_addresses['address_prompt'].apply(geocode)

# # 3 - create longitude, latitude and altitude from location column (returns tuple)
# venue_addresses['point'] = venue_addresses['location'].apply(lambda loc: tuple(loc.point) if loc else None)

# # 4 - split point column into latitude, longitude and altitude columns
# venue_addresses[['latitude2', 'longitude2', 'altitude2']] = pd.DataFrame(venue_addresses['point'].tolist(), index=venue_addresses.index)

# # save csv
# venue_addresses.to_csv('data/DAAO_venue_addresses_geocoded.csv',index=False)

# read csv
# venue_addresses = pd.read_csv('data/local/DAAO_venue_addresses_geocoded.csv')
venue_addresses = fetch_small_data_from_github('DAAO_venue_addresses_geocoded.csv')

print(f'{round(venue_addresses["point"].notnull().sum()/venue_addresses.shape[0],2)*100}% of the addresses could be geocoded.')
33.0% of the addresses could be geocoded.
Hide code cell source
# join the geocoded addresses to the dfs_rich
dfs_rich2 = dfs_rich.merge(venue_addresses,how='left',left_on='place_address',right_on='address_prompt')
dfs_rich2.loc[dfs_rich2['place_address'].str.contains('University of South Australia',na=False),'location':] = np.nan # fix one address
# print(f'{round(dfs_rich2[dfs_rich2["latitude2"].notnull()].shape[0]/dfs_rich2.shape[0],2)*100}% of the events could be geocoded.')

dfs_rich_uncoded = dfs_rich2[(dfs_rich2["latitude2"].isnull()) | (dfs_rich2["address_prompt"].isnull())].copy()
dfs_rich_coded = dfs_rich2[(dfs_rich2["latitude2"].notnull()) & (dfs_rich2["address_prompt"].notnull())].copy()
# dfs_rich_coded = dfs_rich2[(dfs_rich2["latitude2"].notnull()) & (dfs_rich2["address_prompt"].notnull())].copy()[['place_address',
#                                                                      'decade_start',
#                                                                      'latitude','longitude',
#                                                                      'address_prompt',
#                                                                      'location','point',
#                                                                      'latitude2','longitude2']]

# keep rows with Frequency over 4
dfs_rich_coded_counts = dfs_rich_coded[['point']].value_counts().reset_index().rename(columns={0:'Frequency'})
unique_points = dfs_rich_coded_counts[dfs_rich_coded_counts['Frequency'] > 4]['point'].values
dfs_rich_coded_top = dfs_rich_coded[dfs_rich_coded['point'].isin(unique_points)].copy()

# keep rows with Frequency over 4
dfs_rich_uncoded_counts = dfs_rich_uncoded[['latitude','longitude']].value_counts().reset_index().rename(columns={0:'Frequency'})
unique_longlats = dfs_rich_uncoded_counts[dfs_rich_uncoded_counts['Frequency'] > 4][['latitude','longitude']].values
dfs_rich_uncoded_top = dfs_rich_uncoded[dfs_rich_uncoded['latitude'].isin(unique_longlats[:,0]) & dfs_rich_uncoded['longitude'].isin(unique_longlats[:,1])].copy()
Hide code cell source
verified_df = pd.DataFrame()

ignore_these = ['Melbourne, Vic.','Brisbane, Qld','Sydney, NSW','Fremantle, WA','Hobart, TAS','Adelaide, SA',
                'Berlin, Germany','Paris, France','London, England, UK','New York, USA','Gold Coast, QLD','Perth, WA',
                'Newcastle, NSW','Auckland, NZ','Bondi, Sydney, NSW','Canberra, ACT','Launceston, TAS']

for idx, row in enumerate(dfs_rich_coded_top.point.unique()):
    thisrow = dfs_rich_coded_top[dfs_rich_coded_top.point == row].copy()
    topcount = thisrow['address_prompt'].value_counts().reset_index().iloc[0][0]
    if topcount not in ignore_these: verified_df = pd.concat([verified_df,thisrow],axis=0)

# create a new dataframe for each unique address with matches
verified_df1 = pd.DataFrame()

for thispoint in dfs_rich_coded_top[dfs_rich_coded_top.index.isin(verified_df.index)]['point'].unique():
    # get all rows with address in all_addresses
    thisrow = dfs_rich_coded_top[dfs_rich_coded_top['point'] == thispoint].copy()

    # get the most common response
    thisrow['address_prompt'] = thisrow['address_prompt'].value_counts().index[0]

    # concat to verified2
    verified_df1 = pd.concat([verified_df1,thisrow],axis=0)

verified_df1 = verified_df1.drop_duplicates()
print('Below are the top 10 most common place names for the verified addresses (after cleaning part 1): \n')
verified_df1['address_prompt'].value_counts().head(10)
Below are the top 10 most common place names for the verified addresses (after cleaning part 1): 
Art Gallery of New South Wales, Sydney, NSW     292
Institute of Modern Art, Brisbane, QLD          218
Queensland Art Gallery, Brisbane, QLD           126
National Gallery of Victoria, Melbourne, VIC    124
Orange Regional Gallery, Orange, NSW            102
Warrnambool Art Gallery, Warrnambool, VIC        99
Art Gallery of Western Australia, Perth, WA      91
Ballarat Fine Art Gallery, Ballarat, VIC         83
Artspace, Sydney, NSW                            79
Heide Museum of Modern Art, Melbourne, VIC       72
Name: address_prompt, dtype: int64

3. Cleaning by edit distance#

The remaining venues are cleaned using string similarity. If two place names have a TF-IDF cosine similarity above 75% (a cosine distance below 0.25), they are treated as the same place. This is a conservative threshold, but it is necessary to ensure that we do not merge two distinct places. Again, we only keep venues with more than four exhibitions.
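For intuition, the similarity rule can be illustrated at the character level with the standard library's difflib, as sketched below; note that the notebook's actual implementation in the next cell compares word-level TF-IDF vectors with cosine distance rather than raw characters.

# Illustrative only: a character-level similarity check with difflib,
# not the TF-IDF cosine approach used in the cell below.
from difflib import SequenceMatcher

a = 'wollongong city gallery wollongong nsw'
b = 'wollongong city art gallery wollongong nsw'

similarity = SequenceMatcher(None, a, b).ratio()  # 0 (different) to 1 (identical)
verdict = 'treat as the same place' if similarity > 0.75 else 'keep separate'
print(f'{similarity:.2f}: {verdict}')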

Hide code cell source
# use pairwise distancing to find similar addresses using only address_prompt
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.feature_extraction.text import TfidfVectorizer

unique_addresses = dfs_rich_uncoded_top[dfs_rich_uncoded_top['address_prompt'].notnull()]['address_prompt'].unique()

# convert to lower case
unique_addresses2 = [x.lower() for x in unique_addresses]

# remove commas and periods
unique_addresses2 = [x.replace(',','') for x in unique_addresses2]
unique_addresses2 = [x.replace('.','') for x in unique_addresses2]

# remove_these_terms = ['nsw', 'sydney', 'melbourne', 'vic', 'qld', 'brisbane', 
#                       'adelaide', 'new south wales', 'perth', 'canberra', 'australia', 'paddington',
#                       'tas','act','queensland','hobart']

# remove strings in this list, remove_these_terms
# unique_addresses2 = [' '.join([word for word in x.split() if word not in remove_these_terms]) for x in unique_addresses2]

unique_addresses = pd.DataFrame([unique_addresses, unique_addresses2]).T

tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(unique_addresses2)
distances = pairwise_distances(tfidf_matrix, metric='cosine')

# get pairs with cosine distance less than 0.25
pairs = np.argwhere(distances < 0.25)
pairs = [(unique_addresses2[x[0]], unique_addresses2[x[1]]) for x in pairs]

# get unique pairs
unique_pairs = []
for pair in pairs:
    if pair[0] != pair[1] and pair not in unique_pairs and (pair[1], pair[0]) not in unique_pairs:
        unique_pairs.append(pair)

# print(len(unique_pairs))

unique_pairs_df = pd.DataFrame(unique_pairs)
unique_pairs_df.columns = ['address1','address2']

# create a new dataframe for each unique address1
# each unique address1 should have a corresponding list of address2s

# create a dictionary of address1 and address2s
address_dict = dict()
for idx, row in unique_pairs_df.iterrows():
    if row['address1'] not in address_dict: address_dict[row['address1']] = [row['address2']]
    else: address_dict[row['address1']].append(row['address2'])

# create a dataframe of address1 and address2s
address_df = pd.DataFrame.from_dict(address_dict, orient='index').reset_index()

# rename columns
address_df.columns = ['address1','address2','address3','address4','address5']

# merge with unique_addresses
address_df = unique_addresses.merge(address_df, left_on=1, right_on='address1', how='left')
address_df['all_addresses'] = address_df.apply(lambda x: [x['address1'], x['address2'], x['address3'], x['address4'], x['address5']], axis=1)
address_df['all_addresses'] = address_df['all_addresses'].apply(lambda x: [y for y in x if str(y) != 'nan'])
address_df['all_addresses'] = address_df['all_addresses'].apply(lambda x: list(set(x)))

address_dict = dict()
for idx, row in address_df.iterrows(): address_dict[row[1]] = row[0]

# replace all data from address1 to address5 according to address_dict
for idx, row in address_df.iterrows():
    for col in ['address1','address2','address3','address4','address5']:
        if row[col] is not None:
            if str(row[col]) != 'nan': address_df.loc[idx,col] = address_dict[row[col]]

# create a list column in address_df which stores all data from address1 to address5 without nulls
address_df['all_addresses'] = address_df.apply(lambda x: [x['address1'], x['address2'], x['address3'], x['address4'], x['address5']], axis=1)
address_df['all_addresses'] = address_df['all_addresses'].apply(lambda x: [y for y in x if y is not None])
address_df['all_addresses'] = address_df['all_addresses'].apply(lambda x: [y for y in x if str(y) != 'nan'])
address_df = address_df[[0,'all_addresses']]

# create a new dataframe for each unique address with matches
verified_df2 = pd.DataFrame()

for idx,row in address_df.iterrows():
    if len(row['all_addresses']) > 0:
        # get all rows with address in all_addresses
        thisrow = dfs_rich_uncoded_top[dfs_rich_uncoded_top['address_prompt'].isin(row['all_addresses'])].copy()

        if thisrow.shape[0] > 4:
            # get the most common response
            thisrow['address_prompt'] = thisrow['address_prompt'].value_counts().index[0]

            # concat to verified2
            verified_df2 = pd.concat([verified_df2,thisrow],axis=0)

# verified_df2 = verified_df2.drop_duplicates()

# merge back with original dataframe
v_df2_cols = dfs_rich_uncoded_top.columns.tolist()
v_df2_cols.remove('address_prompt')
verified_df2 = pd.concat([dfs_rich_uncoded_top, verified_df2],axis=0).drop_duplicates(subset=v_df2_cols)

# get the most common response
print('Below are the top 10 most common place names for the verified addresses (after cleaning part 2): \n')
verified_df2['address_prompt'].value_counts().head(10)
Below are the top 10 most common place names for the verified addresses (after cleaning part 2): 
Experimental Art Foundation, Adelaide, SA                             424
Contemporary Art Centre of South Australia, Adelaide, SA              341
Wollongong City Gallery, Wollongong, NSW                              165
Museums and Art Galleries of the Northern Territory, Darwin, NT       128
Newcastle Region Art Gallery, Newcastle, NSW                          106
8 Llankelly Place, Kings Cross, Sydney, 2011                           96
Queen Victoria Museum and Art Gallery, Launceston, TAS                 96
Ian Potter Museum of Art, University of Melbourne, Melbourne, Vic.     82
Tin Sheds Gallery, University of Sydney, Sydney, NSW                   80
Brisbane City Art Gallery, Brisbane, Qld                               68
Name: address_prompt, dtype: int64

4. Final cleaning phase and removing duplicates#

The final phase merges the two cleaned datasets and removes any duplicate venues. We also remove venues with fewer than five exhibitions. In the final data, 43% of rows were cleaned by geocode aggregation and 57% by the similarity-based matching.
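Because the two cleaning streams are tagged with a coded flag in the cell below (True for geocode aggregation, False for the similarity-based matching), the reported split can be checked once clean_data_v2 has been built. A minimal sketch, to be run after the next cell:

# Share of event rows (and of unique venues) contributed by each cleaning stream.
print(clean_data_v2['coded'].value_counts(normalize=True).round(2))
print(clean_data_v2.groupby('coded')['address_prompt'].nunique())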

Hide code cell source
verified_df1['coded'] = True
verified_df2['coded'] = False

clean_data_v1 = pd.concat([verified_df1,verified_df2],axis=0)
clean_data_v1 = clean_data_v1.drop_duplicates(subset=v_df2_cols)

# use pairwise distancing to find similar addresses using only address_prompt
unique_addresses = clean_data_v1[clean_data_v1['address_prompt'].notnull()]['address_prompt'].unique()

# convert to lower case
unique_addresses2 = [x.lower() for x in unique_addresses]

# remove commas and periods
unique_addresses2 = [x.replace(',','') for x in unique_addresses2]
unique_addresses2 = [x.replace('.','') for x in unique_addresses2]
unique_addresses = pd.DataFrame([unique_addresses, unique_addresses2]).T

tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(unique_addresses2)
distances = pairwise_distances(tfidf_matrix, metric='cosine')

# get pairs with cosine distance less than 0.15
pairs = np.argwhere(distances < 0.15)
pairs = [(unique_addresses2[x[0]], unique_addresses2[x[1]]) for x in pairs]

# get unique pairs
unique_pairs = []
for pair in pairs:
    if pair[0] != pair[1] and pair not in unique_pairs and (pair[1], pair[0]) not in unique_pairs:
        unique_pairs.append(pair)

# print(len(unique_pairs))

unique_pairs_df = pd.DataFrame(unique_pairs)
unique_pairs_df.columns = ['address1','address2']

# create a new dataframe for each unique address1
# each unique address1 should have a corresponding list of address2s

# create a dictionary of address1 and address2s
address_dict = dict()
for idx, row in unique_pairs_df.iterrows():
    if row['address1'] not in address_dict: address_dict[row['address1']] = [row['address2']]
    else: address_dict[row['address1']].append(row['address2'])

# create a dataframe of address1 and address2s
address_df = pd.DataFrame.from_dict(address_dict, orient='index').reset_index()

# rename columns
address_df.columns = ['address1','address2','address3','address4']

# merge with unique_addresses
address_df = unique_addresses.merge(address_df, left_on=1, right_on='address1', how='left')
address_df['all_addresses'] = address_df.apply(lambda x: [x['address1'], x['address2'], x['address3'], x['address4']], axis=1)
address_df['all_addresses'] = address_df['all_addresses'].apply(lambda x: [y for y in x if str(y) != 'nan'])
address_df['all_addresses'] = address_df['all_addresses'].apply(lambda x: list(set(x)))

address_dict = dict()
for idx, row in address_df.iterrows(): address_dict[row[1]] = row[0]

# replace all data from address1 to address4 according to address_dict
for idx, row in address_df.iterrows():
    for col in ['address1','address2','address3','address4']:
        if row[col] is not None:
            if str(row[col]) != 'nan': address_df.loc[idx,col] = address_dict[row[col]]

# create a list column in address_df which stores all data from address1 to address4 without nulls
address_df['all_addresses'] = address_df.apply(lambda x: [x['address1'], x['address2'], x['address3'], x['address4']], axis=1)
address_df['all_addresses'] = address_df['all_addresses'].apply(lambda x: [y for y in x if y is not None])
address_df['all_addresses'] = address_df['all_addresses'].apply(lambda x: [y for y in x if str(y) != 'nan'])
address_df = address_df[[0,'all_addresses']]

# update dataframe with new address_prompt
clean_data_v2 = clean_data_v1.copy()

for idx,row in address_df.iterrows():
    if len(row['all_addresses']) > 0:
        # get all rows with address in all_addresses
        thisrow = clean_data_v2[clean_data_v2['address_prompt'].isin(row['all_addresses'])].copy()

        if thisrow.shape[0] > 4:
            # get the most common response
            clean_data_v2.loc[clean_data_v2['address_prompt'].isin(row['all_addresses']),'address_prompt'] = thisrow['address_prompt'].value_counts().index[0]

# merge back with original dataframe
clean_data_v2 = pd.concat([clean_data_v1, clean_data_v2],axis=0).drop_duplicates(subset=v_df2_cols)
clean_data_v2 = clean_data_v2[clean_data_v2['address_prompt'].isin(clean_data_v2['address_prompt'].value_counts()[clean_data_v2['address_prompt'].value_counts() > 4].index)]
clean_data_v2 = clean_data_v2.drop_duplicates()

print (f'At this point, we have {clean_data_v2.sort_values("address_prompt")["address_prompt"].nunique()} unique venues.')
At this point, we have 131 unique venues.

Through manual inspection, we identified some duplicate venues that were not picked up by our pre-processing. We fix these manually.

Hide code cell source
ngv_dict = clean_data_v2[clean_data_v2.address_prompt == 'National Gallery of Victoria, Melbourne, VIC'][['location','point','latitude2','longitude2','altitude2','coded']]\
            .head(1).T.iloc[:,0].to_list()

monash_dict = clean_data_v2[clean_data_v2.address_prompt == 'Monash University Museum of Art, Melbourne, VIC'][['location','point','latitude2','longitude2','altitude2','coded']]\
            .head(1).T.iloc[:,0].to_list()

ervin_dict = clean_data_v2[clean_data_v2.address_prompt == 'S.H. Ervin Gallery, Sydney, NSW'][['location','point','latitude2','longitude2','altitude2','coded']]\
            .head(1).T.iloc[:,0].to_list()

tmag_dict = clean_data_v2[clean_data_v2.address_prompt == 'Tasmanian Museum and Art Gallery, Hobart, TAS'][['location','point','latitude2','longitude2','altitude2','coded']]\
            .head(1).T.iloc[:,0].to_list()

nas_dict = clean_data_v2[clean_data_v2.address_prompt == 'National Art School, Sydney, NSW'][['location','point','latitude2','longitude2','altitude2','coded']]\
            .head(1).T.iloc[:,0].to_list()

camcag_dict = clean_data_v2[clean_data_v2.address_prompt == 'Campbelltown City Art Gallery, Campbelltown, NSW'][['location','point','latitude2','longitude2','altitude2','coded']]\
            .head(1).T.iloc[:,0].to_list()


duplicate_dict = dict({'Ivan Dougherty Gallery, COFA, UNSW, Paddington, NSW': 'Ivan Dougherty Gallery, Sydney, NSW',
                       'Ivan Dougherty Gallery, University of New South Wales, Sydney, NSW': 'Ivan Dougherty Gallery, Sydney, NSW',
                       'Christine Abrahams Gallery, Melbourne, Vic.': 'Christine Abrahams Gallery, Melbourne, VIC',
                       'Gallery Gabrielle Pizzi, Melbourne, Vic.':'Gallery Gabrielle Pizzi, Melbourne, VIC',
                       'Tin Sheds Gallery, Sydney, NSW':'Tin Sheds Gallery, University of Sydney, Sydney, NSW',
                       'David Jones Gallery, Sydney, NSW':'David Jones Art Gallery, Sydney, NSW',
                       'Monash University Gallery, Melbourne, VIC':'Monash University Museum of Art, Melbourne, VIC',
                       'Flinders University City Gallery, Adelaide, SA':'Flinders University Art Museum, Adelaide, SA',
                       'Hogarth Galleries, Sydney, NSW':'Hogarth Gallery, Sydney, NSW',
                       'Wollongong City Gallery, Wollongong, NSW':'Wollongong City Art Gallery, Wollongong, NSW',
                       'National Art School Gallery, Sydney, NSW':'National Art School, Sydney, NSW',
                       'Mary Place Gallery, Sydney, NSW':'Mary Place Gallery, Paddington, NSW'
                       })

# replace address_prompt using duplicate_dict
clean_data_v2['address_prompt'] = clean_data_v2['address_prompt'].replace(duplicate_dict)

# replace all rows with address_prompt == 'National Gallery of Victoria, Melbourne, VIC' with ngv_dict
clean_data_v2.loc[clean_data_v2.address_prompt == 'National Gallery of Victoria, Melbourne, VIC',['location','point','latitude2','longitude2','altitude2','coded']] = ngv_dict
clean_data_v2.loc[clean_data_v2.address_prompt == 'Monash University Museum of Art, Melbourne, VIC',['location','point','latitude2','longitude2','altitude2','coded']] = monash_dict
clean_data_v2.loc[clean_data_v2.address_prompt == 'S.H. Ervin Gallery, Sydney, NSW',['location','point','latitude2','longitude2','altitude2','coded']] = ervin_dict
clean_data_v2.loc[clean_data_v2.address_prompt == 'Tasmanian Museum and Art Gallery, Hobart, TAS',['location','point','latitude2','longitude2','altitude2','coded']] = tmag_dict
clean_data_v2.loc[clean_data_v2.address_prompt == 'National Art School, Sydney, NSW',['location','point','latitude2','longitude2','altitude2','coded']] = nas_dict
clean_data_v2.loc[clean_data_v2.address_prompt == 'Campbelltown City Art Gallery, Campbelltown, NSW',['location','point','latitude2','longitude2','altitude2','coded']] = camcag_dict

clean_data_v2 = clean_data_v2.drop_duplicates()
print (f'After removing duplicates, we now have {clean_data_v2.sort_values("address_prompt")["address_prompt"].nunique()} unique venues.')
print('Below are the top 20 most common place names for the verified addresses (after final cleaning): \n')
clean_data_v2.address_prompt.value_counts().head(20)
After removing duplicates, we now have 119 unique venues.
Below are the top 20 most common place names for the verified addresses (after final cleaning): 
Experimental Art Foundation, Adelaide, SA                             424
Contemporary Art Centre of South Australia, Adelaide, SA              341
Art Gallery of New South Wales, Sydney, NSW                           292
Institute of Modern Art, Brisbane, QLD                                218
Wollongong City Art Gallery, Wollongong, NSW                          171
Museums and Art Galleries of the Northern Territory, Darwin, NT       128
Queensland Art Gallery, Brisbane, QLD                                 126
National Gallery of Victoria, Melbourne, VIC                          124
Newcastle Region Art Gallery, Newcastle, NSW                          106
Orange Regional Gallery, Orange, NSW                                  102
Warrnambool Art Gallery, Warrnambool, VIC                              99
Queen Victoria Museum and Art Gallery, Launceston, TAS                 96
8 Llankelly Place, Kings Cross, Sydney, 2011                           96
Art Gallery of Western Australia, Perth, WA                            91
Tin Sheds Gallery, University of Sydney, Sydney, NSW                   88
Ballarat Fine Art Gallery, Ballarat, VIC                               83
Ian Potter Museum of Art, University of Melbourne, Melbourne, Vic.     82
Artspace, Sydney, NSW                                                  79
Heide Museum of Modern Art, Melbourne, VIC                             72
Brisbane City Art Gallery, Brisbane, Qld                               68
Name: address_prompt, dtype: int64

Compare this with the original table:

Venue                                                 City               Frequency
Art Gallery of New South Wales                        Sydney, NSW              292
Wollongong City Gallery                               Wollongong, NSW          183
Museums and Art Galleries of the Northern Territory   Darwin, NT               128
National Gallery of Victoria                          Melbourne, VIC           124
Queensland Art Gallery                                Brisbane, QLD            113
Newcastle Region Art Gallery                          Newcastle, NSW           106
Orange Regional Gallery                               Orange, NSW              102
Warrnambool Art Gallery                               Warrnambool, VIC          99
8 Llankelly Place, Kings Cross                        Sydney, NSW               96
Art Gallery of Western Australia                      Perth, WA                 90

Extra categories#

Before we continue with our NLP analysis, we also add some extra categories to the data. These are manually added based on the venue names.

Hide code cell source
# read curated list 
venues_checked = fetch_small_data_from_github("DAAO_venues_list_with_categories.csv")

# manually create a State column
venues_checked['State'] = np.where(venues_checked['venue_name'].str.contains('QLD') | venues_checked['venue_name'].str.contains('Qld'), 'QLD', np.nan)
venues_checked['State'] = np.where(venues_checked['venue_name'].str.contains('NT'), 'NT', venues_checked['State'])
venues_checked['State'] = np.where(venues_checked['venue_name'].str.contains('ACT'), 'ACT', venues_checked['State'])
venues_checked['State'] = np.where(venues_checked['venue_name'].str.contains('Sydney') | venues_checked['venue_name'].str.contains('NSW'), 'NSW', venues_checked['State'])
venues_checked['State'] = np.where(venues_checked['venue_name'].str.contains(' SA') | venues_checked['venue_name'].str.contains('South Australia'), 'SA', venues_checked['State'])
venues_checked['State'] = np.where(venues_checked['venue_name'].str.contains('WA'), 'WA', venues_checked['State'])
venues_checked['State'] = np.where(venues_checked['venue_name'].str.contains('VIC') | venues_checked['venue_name'].str.contains('Vic'),'VIC', venues_checked['State'])
venues_checked['State'] = np.where(venues_checked['venue_name'].str.contains('Tas') | venues_checked['venue_name'].str.contains('TAS'),'TAS', venues_checked['State'])
venues_checked['State'] = np.where(venues_checked['venue_name'].str.contains('USA'), 'USA', venues_checked['State'])
venues_checked['State'] = np.where(venues_checked['venue_name'].str.contains('China'), 'China', venues_checked['State'])
venues_checked['State'] = np.where(venues_checked['venue_name'].str.contains('Griffith'), 'QLD', venues_checked['State'])

# change address_prompt to match curated venue list
clean_data_v2.loc[clean_data_v2.address_prompt.str.contains('321 Blaxland Rd Wentworth Falls'), 'address_prompt'] = '321 Blaxland Rd Wentworth Falls, Sydney'
clean_data_v2.loc[clean_data_v2.address_prompt.str.contains('282 Petrie Terrace'), 'address_prompt'] = '282 Petrie Terrace, Brisbane QLD'

# merge venues_checked with clean_data_v2
clean_data_v2 = clean_data_v2.merge(venues_checked, how='left', left_on='address_prompt', right_on='venue_name')

########### Unique venue frequency by major category ###########
venues_checked_count=pd.DataFrame(dict(Counter(venues_checked["venue_category_major"])).items(),
                              columns=["Venue Category","Frequency"])

# explosion
explode = (0.05, 0.05, 0.05, 0.05)

# Pie Chart
plt.pie(venues_checked_count['Frequency'], 
        labels = venues_checked_count['Venue Category'],
        autopct='%1.1f%%', pctdistance=0.85,
        explode=explode, colors=['#1f77b4','#ff7f0e','#2ca02c','#d62728'])
  
# draw circle
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig = plt.gcf()
  
# Adding Circle in Pie chart
fig.gca().add_artist(centre_circle)
  
# Adding Title of chart
plt.title('Unique count of DAAO venues (n=119), Major categories')

# Displaying Chart
plt.show()

########### Exhibition frequency by major category ###########
venues_checked_count2=pd.DataFrame(dict(Counter(clean_data_v2["venue_category_major"])).items(),
                              columns=["Venue Category","Frequency"])

# explosion
explode = (0.05, 0.05, 0.05, 0.05)
  
# Pie Chart
plt.pie(venues_checked_count2['Frequency'], 
        labels = venues_checked_count2['Venue Category'],
        autopct='%1.1f%%', pctdistance=0.85,
        explode=explode, colors=['#d62728','#1f77b4','#2ca02c','#ff7f0e'])

# draw circle
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig = plt.gcf()

# # make figure larger
# fig.set_size_inches(6,5.75)
  
# Adding Circle in Pie chart
fig.gca().add_artist(centre_circle)

# Adding Title of chart
plt.title('Count of DAAO venue activity (n=4,522), Major categories')
  
# Displaying Chart
plt.show()
[Output: two donut charts — 'Unique count of DAAO venues (n=119), Major categories' and 'Count of DAAO venue activity (n=4,522), Major categories']
Hide code cell source
########### Unique venue frequency by State ###########
ax = venues_checked['State'].value_counts().plot(kind='bar', figsize=(10, 6), rot=0) # rotate x-labels
for p in ax.patches: ax.annotate(str(round(p.get_height()/len(venues_checked), 2)), (p.get_x() + 0.065, p.get_height() + 1.015))

# make y-axis higher 
plt.ylim(0, 65)

plt.title('Count (and proportion) of DAAO venues by State/Country (n=119)')
plt.show()

########### Exhibition frequency by State ###########
ax = clean_data_v2['State'].value_counts().plot(kind='bar', figsize=(10, 6), rot=0) # rotate x-labels

for p in ax.patches: ax.annotate(str(round(p.get_height()/len(clean_data_v2), 2)), (p.get_x() + 0.075, p.get_height() + 20))

# make y-axis higher 
plt.ylim(0, 1750)

plt.title('Count (and proportion) of DAAO venue activity by State/Country (n=4,522)')
plt.show()
[Output: two bar charts — 'Count (and proportion) of DAAO venues by State/Country (n=119)' and 'Count (and proportion) of DAAO venue activity by State/Country (n=4,522)']

Most frequent terms used for place names#

From here, we explore the most frequent terms used for place names. We find that the most frequent terms are “Gallery”, “Art”, “Centre”, “Museum”, and “University”.

Hide code cell source
# find the most frequent words across the strings for place names
from collections import Counter
import matplotlib.pyplot as plt

import nltk
# nltk.download('punkt')

from nltk.tokenize import word_tokenize

# create a list of all the place names
all_place_names = clean_data_v2['address_prompt'].astype(str).tolist()

# create a list of all the words in the place names
all_words = []
for place_name in all_place_names:
    all_words.extend(word_tokenize(place_name))

# find the 1,000 most frequent words
most_freq_words = Counter(all_words).most_common(1000)

# remove a list of all words that are not relevant
words_to_remove = [',','NSW','Sydney','Melbourne','Adelaide','South','SA','Brisbane','VIC',
                   '.', 'New', 'Australia', 'QLD', 'Vic', 'Wales', 'WA', 'Canberra', 'and',
                   'Perth', 'ACT', 'of', 'Qld', 'Victoria','Wollongong','TAS','Queensland','Newcastle',
                   'Street','Hobart','the','The','Launceston','Orange','UK','NT','London','USA',
                   'Paddington','Darwin','for','Western','Warrnambool','Ballarat','Northern','Territory',
                   'England','Watters','Macquarie','Artspace','St',"'s",'&','Potter','Kings','Ian','Cross',
                   '8','Llankelly','2011','Fremantle','Queen','Ivan','Dougherty','Tasmania','Central','Tamar',
                   'Curtin','France','Tin','Sheds','York','Monash','Paris','Heide','Tasmanian','Sherman','Campbelltown']

# remove the words from the list of most frequent words
most_freq_words = [word for word in most_freq_words if word[0] not in words_to_remove]

most_freq_words_dict = dict(most_freq_words)
# keep words in most_freq_words_dict with more than 50 occurrences
most_freq_words_dict = {k: v for k, v in most_freq_words_dict.items() if v > 50}

# merge the 'Galleries' count into 'Gallery'
most_freq_words_dict['Gallery'] = most_freq_words_dict['Gallery'] + most_freq_words_dict['Galleries']
# most_freq_words_dict['Museum'] = most_freq_words_dict['Museum'] + most_freq_words_dict['Museums']

# then remove the 'Galleries' key
most_freq_words_dict.pop('Galleries')
# most_freq_words_dict.pop('Museums')
most_freq_words_dict2 = most_freq_words_dict.copy()

# create a wordcloud with the most frequent words
from wordcloud import WordCloud

# fix the random state so the word cloud layout is reproducible
wordcloud = WordCloud(width = 800, height = 800,
                background_color ='white',
                min_font_size = 10, random_state=100).generate_from_frequencies(most_freq_words_dict2)

# plot the WordCloud image
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)

plt.show()
[Output: word cloud of the most frequent terms in venue place names]

Hierarchical clustering using event description data#

Next, we explore exhibition descriptions using hierarchical clustering, a method that groups similar objects into clusters arranged in a hierarchical structure. This can help conceptualise what a taxonomy of venue categories might look like. We use Google’s BERT to embed the text.
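Before walking through the full pipeline, the sketch below illustrates the clustering step in isolation. It assumes embeddings is an (n_venues x d) array of document embeddings; random numbers stand in for the real BERT vectors, and scipy's linkage is used here purely for illustration, whereas the notebook itself builds its linkage matrix from scikit-learn's AgglomerativeClustering (see get_linkage_matrix below).

# Minimal sketch: agglomerative clustering and a truncated dendrogram on
# pre-computed embeddings (random stand-ins here, not real BERT vectors).
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster

embeddings = np.random.rand(20, 768)                  # placeholder for BERT 'pooler_output' vectors
Z = linkage(embeddings, method='ward')                # build the merge tree
cluster_ids = fcluster(Z, t=5, criterion='maxclust')  # cut the tree into five clusters

dendrogram(Z, orientation='top', truncate_mode='lastp', p=5, show_leaf_counts=True)
plt.title('Hierarchical clustering sketch')
plt.show()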

Hierarchical clustering (BERT)#

The first dendrogram shows the hierarchical clustering of the exhibition descriptions using a BERT encoder. We cluster the data into five groups. The x-axis provides a count (in brackets) of the exhibitions within each cluster.

We provide three sets of annotations. The annotations above the x-axis (in red) show the most frequently occurring terms within the event descriptions for each cluster. A second set of frequent terms follows in black; these are filtered to contain only distinctive terms, giving a better sense of the differences between clusters. Lastly, the annotations below the x-axis show the most frequently occurring venues for each cluster.

Hide code cell source
import seaborn as sns
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, fcluster

# import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords

### collect all relevant textual data from the dataframe
# if description is not \n then append to slug2 in one line
clean_data_v2['slug2'] = clean_data_v2['title'].fillna('') + clean_data_v2['description'].apply(lambda x: '' if x == '\n' else x)

### pre-process for NLP
# Load the documents and their corresponding categorical variables into a Pandas dataframe
df = pd.DataFrame({'text': clean_data_v2['slug2'], 'category': clean_data_v2['address_prompt']})

# concatenate all text for each unique place name (category)
df['text'] = df.groupby('category')['text'].transform(lambda x: ' '.join(x))

#add new column with count for each category
df['cat_count'] = df.groupby('category')['category'].transform('count')
df.drop_duplicates(inplace=True)

# Clean the text
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.replace(',', '')
    text = text.lower().split()
    text = [word for word in text if word not in stop_words]
    text = ' '.join(text)
    return text

df = df[df['text'].notnull()]
df['clean_text'] = df['text'].apply(clean_text)

### Generate clusters

def get_most_common_word(df, more_words = [], no_of_words=25):
    # remove a list of all words that are not relevant
    # words_to_remove = [',','NSW','Sydney','Melbourne','Adelaide','South','SA','Brisbane','VIC',
    #                 '.', 'New', 'Australia', 'QLD', 'Vic', 'Wales', 'WA', 'Canberra', 'and',
    #                 'Perth', 'ACT', 'of', 'Qld', 'Victoria','Wollongong','TAS','Queensland','Newcastle',
    #                 'Street','Hobart','the','The','Launceston','Orange','UK','NT','London','USA',
    #                 'Paddington','Darwin','for','Western','Warrnambool','Ballarat','Northern','Territory',
    #                 'England','Watters','Macquarie','Artspace','St',"'s",'&','Potter','Kings','Ian','Cross',
    #                 '8','Llankelly','2011','Fremantle','Queen','Ivan','Dougherty','Tasmania','Central',
    #                 'Curtin','France','Tin','Sheds','York','Monash','Paris','Heide','Place','Vic.',]
    
    words_to_remove = []
    
    # add more words to the list of words to remove
    words_to_remove = words_to_remove + more_words

    all_words = []
    for i in df:
        for j in i.split(' '):
            all_words.append(j.replace(',',''))

    # find the most common words
    most_common_words = Counter(all_words).most_common(no_of_words)

    # remove the words from the list of most frequent words
    most_common_words = [word for word in most_common_words if word[0] not in words_to_remove]
    return most_common_words

def get_linkage_matrix(model, **kwargs):
    # Create linkage matrix and then plot the dendrogram

    # create the counts of samples under each node
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    return linkage_matrix
Hide code cell source
# ### load bert model

# # !pip install transformers
# from transformers import BertTokenizer, BertModel
# from transformers import pipeline
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained("bert-base-uncased")

# ### encode text using bert (takes 15 seconds to run)
# import random
# import pickle

# # encode the text using bert
# def bert_encode(x):
#   encoded_input = tokenizer(x, return_tensors='pt')
#   output = model(**encoded_input)
#   return pd.DataFrame(output['pooler_output'].detach().numpy()).T

# # randomly sample 300 words from each row in df['clean_text'] to stay under BERT's 512-token limit
# # some strings are shorter than 300 words
# df['clean_text_sampled'] = df['clean_text'].apply(lambda x: ' '.join(random.sample(x.split(' '), 300)) if len(x.split(' ')) >= 300 else x)
# X_bert = df['clean_text_sampled'].apply(lambda x: pd.Series(bert_encode([str(x)])[0]))

# # setting distance_threshold=0 ensures we compute the full tree.
# model_bert = AgglomerativeClustering(distance_threshold=0, n_clusters=None)
# model_bert = model_bert.fit(np.array(X_bert))

# # save model as pickle
# pickle.dump(model_bert, open('models/model_bert.pkl', 'wb'))
Hide code cell source
from textwrap import wrap

# read model from pickle
import pickle

# find words that appear more than `occurences` times, so they can be removed later
def find_duplicates(all_words, occurences=1):
    duplicates = []
    non_duplicates = []

    for i in all_words:
        if i in duplicates: continue
        else:
            if all_words.count(i) > occurences: duplicates.append(i)
            else: non_duplicates.append(i)
    return duplicates

def fetch_bert_models_from_github(fname):
    url = f"https://raw.githubusercontent.com/acd-engine/jupyterbook/master/data/analysis/models/{fname}"
    response = requests.get(url)
    rawdata = response.content
    return pickle.loads(rawdata)

model_bert = fetch_bert_models_from_github("model_bert.pkl")

# cut = 5
# l_matrix = get_linkage_matrix(model_bert)
# df['cluster'] = fcluster(l_matrix, cut, criterion='maxclust')
# dendrogram(l_matrix, orientation='top', truncate_mode="lastp", p=cut, show_leaf_counts=True)

# all_words = []

# for i in df['cluster'].unique():
#     cluster_docs = df[df['cluster'] == i]
#     # print(i, get_most_common_word(cluster_docs['clean_text']))

#     annot = "\n".join(i[0] for idx,i in enumerate(get_most_common_word(cluster_docs['clean_text'])) if (idx < 3))
    
#     plt.annotate(annot, xy=(i/df['cluster'].nunique()-0.1, 0.15), 
#                  xytext=(i/df['cluster'].nunique()-0.1, 0.15), 
#                  xycoords='axes fraction', fontsize=9, color='red')

#     [all_words.append(i[0]) for idx,i in enumerate(get_most_common_word(cluster_docs['clean_text']))]
    
# all_words_to_remove = find_duplicates(all_words, occurences=2)
# all_words_to_remove.extend(['j','n','r','th','nd','exhibitionexhibited','http','www','isbn'])

# for i in df['cluster'].unique():
#     cluster_docs = df[df['cluster'] == i]
#     # print(i, get_most_common_word(cluster_docs['clean_text']))
#     annot = "\n".join(i[0] for idx,i in enumerate(get_most_common_word(cluster_docs['clean_text'],
#                                                                        more_words=all_words_to_remove)) if (idx < 5))
    
#     plt.annotate(annot, xy=(i/df['cluster'].nunique()-0.1, 0.025), 
#                  xytext=(i/df['cluster'].nunique()-0.1, 0.025), 
#                  xycoords='axes fraction', fontsize=9)
    
#     annot2 = cluster_docs.sort_values('cat_count', ascending=False)['category'].values[0:3]
#     annot2 = '\n\n'.join(['\n'.join(wrap(line, 18)) for line in [i.split(',')[0] for i in annot2]])
#     # annot2 = '\n'.join(wrap(annot2, 18)) # breaks strings into new lines

#     plt.annotate(annot2, xy=(i/df['cluster'].nunique()-0.115, -0.24), 
#                 xytext=(i/df['cluster'].nunique()-0.115, -0.24), 
#                 xycoords='axes fraction', fontsize=9)

# plt.title("Hierarchical Clustering Dendrogram - BERT")

# # make figure bigger
# fig = plt.gcf()
# fig.set_size_inches(14, 10)

# plt.show()

# # save the figure
# fig.savefig('images/images_analysis/DAAOVenues_BERT1.png', dpi=300, bbox_inches='tight')

# # save the data
# df_model_bert = df.merge(clean_data_v2[['venue_name','venue_category_major','venue_category_minor','still_exists','State']], 
#                          left_on='category', right_on='venue_name', how='left').drop_duplicates()

# df_model_bert.to_csv('data/local/DAAOVenues_BERT1.csv', index=False)

from IPython.display import Image
Image(filename='images/images_analysis/DAAOVenues_BERT1.png')
Hide code cell source
# read from github
df_model_bert = fetch_small_data_from_github("DAAOVenues_BERT1.csv")

# display data
show(df_model_bert.drop(['venue_name','clean_text','clean_text_sampled','text'],axis=1), scrollY="400px", scrollCollapse=True, scrollX=True,
     paging=False, showIndex=False, column_filters="footer", dom="tpr")
[Interactive table output with columns: category, cat_count, cluster, venue_category_major, venue_category_minor, still_exists, State]
Hide code cell source
##### Venue Category #####
# using plotly, create interactive plot bar chart to show the venue categories proportion for each cluster
import plotly.express as px
import plotly.graph_objects as go

df_model_bert_count_prop = pd.merge(df_model_bert[['cluster','venue_category_major']]\
        .groupby(['cluster','venue_category_major'])\
        .size().reset_index(name='Count')\
        .rename(columns={'venue_category_major':'Category','cluster':'Cluster'}),
        df_model_bert[['cluster','venue_category_major']]\
        .groupby(['cluster','venue_category_major'])\
        .size().unstack().apply(lambda x: x/x.sum(), axis=1)\
        .unstack().reset_index(name='Prop')\
        .rename(columns={'venue_category_major':'Category','cluster':'Cluster'}), on = ['Cluster','Category'], 
        how='left')

fig = px.bar(
    data_frame = df_model_bert_count_prop
    ,y = 'Prop'
    ,x = 'Cluster'
    ,color = 'Category'
    ,title='BERT Clusters by major venue category'
    ,height=500
    ,width=800, # add hover
    hover_data = {'Cluster':True, 'Category':True, 'Prop':':.2f', 'Count':True},
)

#move legend to top with two rows
fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="center",
    x=0.5,
    font=dict(size=11.5),
    title=None
))

# make y-axis labels larger
fig.update_yaxes(tickfont=dict(size=16),title_text='')
fig.update_xaxes(tickfont=dict(size=14),title_text='Clusters')

fig.show()

We provide the same visual; however, this time we show the most frequent terms used within the place names rather than within the event descriptions.
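As a quick, self-contained illustration (not the get_most_common_word helper used in the hidden cells), the most frequent place-name tokens per cluster can be read straight from the loaded df_model_bert table:

# count the most frequent whitespace-separated tokens of the venue names in each cluster
for c in sorted(df_model_bert['cluster'].unique()):
    names = df_model_bert.loc[df_model_bert['cluster'] == c, 'category'].astype(str)
    top = names.str.split(expand=True).stack().str.strip(',').value_counts().head(5)
    print(c, list(top.index))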

Hide code cell source
# cut = 5
# l_matrix = get_linkage_matrix(model_bert)
# df['cluster'] = fcluster(l_matrix, cut, criterion='maxclust')
# dendrogram(l_matrix, orientation='top', truncate_mode="lastp", p=cut, show_leaf_counts=True)

# all_words = []

# for i in df['cluster'].unique():
#     cluster_docs = df[df['cluster'] == i]
#     # print(i, get_most_common_word(cluster_docs['clean_text']))

#     annot = "\n".join(i[0] for idx,i in enumerate(get_most_common_word(cluster_docs['category'])) if (idx < 3))
    
#     plt.annotate(annot, xy=(i/df['cluster'].nunique()-0.1, 0.15), 
#                  xytext=(i/df['cluster'].nunique()-0.1, 0.15), 
#                  xycoords='axes fraction', fontsize=9, color='red')
    
#     [all_words.append(i[0]) for idx,i in enumerate(get_most_common_word(cluster_docs['category'], no_of_words=25))]
    
# all_words_to_remove = find_duplicates(all_words, occurences=4)
# all_words_to_remove.extend([',','NSW','Sydney','Melbourne','Adelaide','South','SA','Brisbane','VIC',
#                    '.', 'New', 'Australia', 'QLD', 'Vic','and','WA','Victoria','ACT','Qld',
#                    'of','Wollongong','TAS','Queensland','Newcastle',
#                    'Street','Hobart','the','The','Launceston','Orange','NT',
#                    'Paddington','Darwin','for','Western','Warrnambool','Ballarat','Northern','Territory',
#                    'England','Watters','Macquarie','Artspace','St',"'s",'&','Potter','Kings','Ian','Cross',
#                    '8','Llankelly','2011','Fremantle','Queen','Ivan','Dougherty','Tasmania','Central',
#                    'Curtin','France','Tin','Sheds','York','Monash','Heide',''])

# for i in df['cluster'].unique():
#     cluster_docs = df[df['cluster'] == i]
#     # print(i, get_most_common_word(cluster_docs['clean_text']))
#     annot = "\n".join(i[0] for idx,i in enumerate(get_most_common_word(cluster_docs['category'],
#                                                                        more_words=all_words_to_remove)) if (idx < 5))
    
#     plt.annotate(annot, xy=(i/df['cluster'].nunique()-0.1, 0.05), 
#                  xytext=(i/df['cluster'].nunique()-0.1, 0.05), 
#                  xycoords='axes fraction', fontsize=9)

# plt.title("Hierarchical Clustering Dendrogram - BERT")

# # make figure bigger
# fig = plt.gcf()
# fig.set_size_inches(14, 10)

# plt.show()

# # save the figure
# fig.savefig('images/images_analysis/DAAOVenues_BERT1_placenames.png', dpi=300, bbox_inches='tight')

from IPython.display import Image
Image(filename='images/images_analysis/DAAOVenues_BERT1_placenames.png')

Hierarchical Clustering - BERT by demi-decade#

Using the BERT model, we generate a series of dendrograms that show the hierarchical clustering of the exhibition descriptions for each five-year (demi-decade) period. We omit pre-1960s data, as there are too few records to generate a meaningful dendrogram for each demi-decade (see the table output below).

Hide code cell source
display(clean_data_v2['decade_start'].value_counts().sort_index().reset_index().rename(columns={'index':'decade_start', 'decade_start':'count'}).T)
decade_start   1900   1910   1920   1930   1940   1950   1960   1970   1980   1990   2000   2010   2020
count             7      6      4      5     13     39     77    515   1042   1488   1113    212      1
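The pre-1960s sparsity is even clearer when the same records are binned at demi-decade resolution (a minimal sketch, assuming start_year in clean_data_v2 is numeric):

# bin each event into a five-year period and count the records per bin
demi_decade = clean_data_v2['start_year'] // 5 * 5
print(demi_decade.value_counts().sort_index())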

Top ten place names and year

Hide code cell source
display(clean_data_v2[['start_year','address_prompt']].value_counts().head(10))
start_year  address_prompt                                          
1983.0      Experimental Art Foundation, Adelaide, SA                   45
1982.0      Experimental Art Foundation, Adelaide, SA                   30
1979.0      Contemporary Art Centre of South Australia, Adelaide, SA    25
1986.0      Experimental Art Foundation, Adelaide, SA                   23
1978.0      Contemporary Art Centre of South Australia, Adelaide, SA    23
1976.0      Experimental Art Foundation, Adelaide, SA                   22
1997.0      Experimental Art Foundation, Adelaide, SA                   22
1985.0      Experimental Art Foundation, Adelaide, SA                   22
1979.0      Experimental Art Foundation, Adelaide, SA                   22
1977.0      Experimental Art Foundation, Adelaide, SA                   20
dtype: int64

Top ten place names and decade

Hide code cell source
display(clean_data_v2[['decade_start','address_prompt']].value_counts().head(10))
decade_start  address_prompt                                                 
1980          Experimental Art Foundation, Adelaide, SA                          199
1970          Contemporary Art Centre of South Australia, Adelaide, SA           110
1990          Art Gallery of New South Wales, Sydney, NSW                        106
1980          Contemporary Art Centre of South Australia, Adelaide, SA           105
1990          Experimental Art Foundation, Adelaide, SA                          101
              Institute of Modern Art, Brisbane, QLD                              99
1970          Experimental Art Foundation, Adelaide, SA                           98
1990          Contemporary Art Centre of South Australia, Adelaide, SA            96
1980          Museums and Art Galleries of the Northern Territory, Darwin, NT     80
1990          Wollongong City Art Gallery, Wollongong, NSW                        79
dtype: int64
Hide code cell source
def temporal_dendos(startdt=1900, enddt=1920, cut_input=3, model=None):
    df_period = clean_data_v2[(clean_data_v2['start_year'] >= startdt) & (clean_data_v2['start_year'] <= enddt)]
    # df_period = clean_data_v2[(clean_data_v2['decade_start'] >= startdt) & (clean_data_v2['decade_start'] <= enddt)]
    ### pre-process for NLP
    # Load the documents and their corresponding categorical variables into a Pandas dataframe
    df_period = pd.DataFrame({'text': df_period['slug2'], 'category': df_period['address_prompt']})

    # summarise text for each unique place name
    df_period['text'] = df_period.groupby('category')['text'].transform(lambda x: ' '.join(x))

    #add new column with count for each category
    df_period['cat_count'] = df_period.groupby('category')['category'].transform('count')
    df_period.drop_duplicates(inplace=True)

    # Clean the text
    stop_words = set(stopwords.words('english'))
    df_period = df_period[df_period['text'].notnull()]
    df_period['clean_text'] = df_period['text'].apply(clean_text)

    if model is None:
        # randomly sample 275 words from each row of df_period['clean_text'] to keep inputs
        # well under BERT's 512-token limit; some strings are shorter than 275 words
        df_period['clean_text_sampled'] = df_period['clean_text'].apply(lambda x: ' '.join(random.sample(x.split(' '), 275)) if len(x.split(' ')) >= 275 else x)
        X_bert_period = df_period['clean_text_sampled'].apply(lambda x: pd.Series(bert_encode([str(x)])[0]))

        # setting distance_threshold=0 ensures we compute the full tree.
        model_bert_period = AgglomerativeClustering(distance_threshold=0, n_clusters=None)
        model_bert_period = model_bert_period.fit(np.array(X_bert_period))

        # save model as pickle
        pickle.dump(model_bert_period, open(f'models/model_bert_{startdt}_{enddt}.pkl', 'wb'))
    else:
        model_bert_period = fetch_bert_models_from_github(f'model_bert_{startdt}_{enddt}.pkl')

    ### generate dendrogram
    cut = cut_input
    l_matrix = get_linkage_matrix(model_bert_period)
    df_period['cluster'] = fcluster(l_matrix, cut, criterion='maxclust')
    dendrogram(l_matrix, orientation='top', truncate_mode="lastp", p=cut, show_leaf_counts=True)

    all_words = []

    for i in df_period['cluster'].unique():
        cluster_docs = df_period[df_period['cluster'] == i]
        # print(i, get_most_common_word(cluster_docs['clean_text']))

        annot = "\n".join(i[0] for idx,i in enumerate(get_most_common_word(cluster_docs['clean_text'])) if (idx < 3))
        
        plt.annotate(annot, xy=(i/df_period['cluster'].nunique()-0.1, 0.15), 
                    xytext=(i/df_period['cluster'].nunique()-0.1, 0.15), 
                    xycoords='axes fraction', fontsize=9, color='red')

        [all_words.append(i[0]) for idx,i in enumerate(get_most_common_word(cluster_docs['clean_text']))]
        
    all_words_to_remove = find_duplicates(all_words, occurences=2)
    all_words_to_remove.extend(['j','th','nd','exhibitionexhibited','http','www','isbn'])

    for i in df_period['cluster'].unique():
        cluster_docs = df_period[df_period['cluster'] == i]
        # print(i, get_most_common_word(cluster_docs['clean_text']))
        annot = "\n".join(i[0] for idx,i in enumerate(get_most_common_word(cluster_docs['clean_text'],
                                                                        more_words=all_words_to_remove)) if (idx < 5))
        
        plt.annotate(annot, xy=(i/df_period['cluster'].nunique()-0.1, 0.025), 
                    xytext=(i/df_period['cluster'].nunique()-0.1, 0.025), 
                    xycoords='axes fraction', fontsize=9)
        
        annot2 = cluster_docs.sort_values('cat_count', ascending=False)['category'].values[0:3]
        annot2 = '\n\n'.join(['\n'.join(wrap(line, 18)) for line in [i.split(',')[0] for i in annot2]])
        # annot2 = '\n'.join(wrap(annot2, 18)) # breaks strings into new lines

        plt.annotate(annot2, xy=(i/df_period['cluster'].nunique()-0.115, -0.24), 
                    xytext=(i/df_period['cluster'].nunique()-0.115, -0.24), 
                    xycoords='axes fraction', fontsize=9)

    plt.title(f"Hierarchical Clustering Dendrogram - BERT - {startdt}-{enddt}")

    # make figure bigger
    fig = plt.gcf()
    fig.set_size_inches(14, 10)
    plt.show()

    df_period_model_bert = df_period.merge(clean_data_v2[['venue_name','venue_category_major','venue_category_minor','still_exists','State']], 
                            left_on='category', right_on='venue_name', how='left').drop_duplicates()
    # display data
    show(df_period_model_bert.drop(['venue_name','clean_text','text'],axis=1), scrollY="400px", scrollCollapse=True, scrollX=True,
        paging=False, showIndex=False, column_filters="footer", dom="tpr")

    print('\n')
    return df_period

# # use this configuration when running for the first time in a local env
# # this will also create a pickle locally for each time period
# df_65_70 = temporal_dendos(1965, 1970, cut_input=5, model=None)
# df_70_75 = temporal_dendos(1970, 1975, cut_input=5, model=None)
# df_75_80 = temporal_dendos(1975, 1980, cut_input=5, model=None)
# df_80_85 = temporal_dendos(1980, 1985, cut_input=5, model=None)
# df_85_90 = temporal_dendos(1985, 1990, cut_input=5, model=None)
# df_90_95 = temporal_dendos(1990, 1995, cut_input=5, model=None)
# df_95_00 = temporal_dendos(1995, 2000, cut_input=5, model=None)
# df_00_05 = temporal_dendos(2000, 2005, cut_input=5, model=None)
# df_05_10 = temporal_dendos(2005, 2010, cut_input=5, model=None)

# use this configuration for pre-loaded pickles from github for each time period
df_65_70 = temporal_dendos(1965, 1970, cut_input=5, model=0)
df_70_75 = temporal_dendos(1970, 1975, cut_input=5, model=0)
df_75_80 = temporal_dendos(1975, 1980, cut_input=5, model=0)
df_80_85 = temporal_dendos(1980, 1985, cut_input=5, model=0)
df_85_90 = temporal_dendos(1985, 1990, cut_input=5, model=0)
df_90_95 = temporal_dendos(1990, 1995, cut_input=5, model=0)
df_95_00 = temporal_dendos(1995, 2000, cut_input=5, model=0)
df_00_05 = temporal_dendos(2000, 2005, cut_input=5, model=0)
df_05_10 = temporal_dendos(2005, 2010, cut_input=5, model=0)
[Output: for each demi-decade from 1965-1970 through 2005-2010, a hierarchical clustering dendrogram followed by an interactive table with columns category, cat_count, cluster, venue_category_major, venue_category_minor, still_exists, State.]

Experimental Art Foundation, Adelaide, SA#

Hide code cell source
df_test = pd.DataFrame()
venue = 'Experimental Art Foundation, Adelaide, SA'
df_test = try_merge(df_test, df_65_70, venue)
df_test = try_merge(df_test, df_70_75, venue)
df_test = try_merge(df_test, df_75_80, venue)
df_test = try_merge(df_test, df_80_85, venue)
df_test = try_merge(df_test, df_85_90, venue)
df_test = try_merge(df_test, df_90_95, venue)
df_test = try_merge(df_test, df_95_00, venue)
df_test = try_merge(df_test, df_00_05, venue)
df_test = try_merge(df_test, df_05_10, venue)
print(df_test['category'].value_counts().head(6).tail(-1))
Queensland Art Gallery, Brisbane, QLD                     4
Art Gallery of New South Wales, Sydney, NSW               3
National Gallery of Victoria, Melbourne, VIC              3
Orange Regional Gallery, Orange, NSW                      3
Australian Centre for Contemporary Art, Melbourne, VIC    3
Name: category, dtype: int64

Contemporary Art Centre of South Australia, Adelaide, SA#

Hide code cell source
df_test = pd.DataFrame()
venue = 'Contemporary Art Centre of South Australia, Adelaide, SA'
df_test = try_merge(df_test, df_65_70, venue)
df_test = try_merge(df_test, df_70_75, venue)
df_test = try_merge(df_test, df_75_80, venue)
df_test = try_merge(df_test, df_80_85, venue)
df_test = try_merge(df_test, df_85_90, venue)
df_test = try_merge(df_test, df_90_95, venue)
df_test = try_merge(df_test, df_95_00, venue)
df_test = try_merge(df_test, df_00_05, venue)
df_test = try_merge(df_test, df_05_10, venue)
print(df_test['category'].value_counts().head(6).tail(-1))
Queen Victoria Museum and Art Gallery, Launceston, TAS    7
Ivan Dougherty Gallery, Sydney, NSW                       5
Art Gallery of New South Wales, Sydney, NSW               4
National Gallery of Australia, Canberra, ACT              4
University of Melbourne, Melbourne, Vic                   4
Name: category, dtype: int64

Organisations#

As with venues, we first produce a word cloud of common terms used in organisation names. We find that the most frequent terms are “Ltd”, “Pty”, “Australia”, “Art”, and “Design”.

Hide code cell source
# fetch organisation data from github
org_data = fetch_data(acdedata='organization')
daoo_org_data = org_data[(org_data.data_source.str.contains('DAAO')) & (org_data.primary_name.notnull())]
all_place_names = daoo_org_data.primary_name.apply(lambda x: ast.literal_eval(x)).astype(str).tolist()

# create a list of all the words in the place names
all_words = []
for place_name in all_place_names:
    all_words.extend(word_tokenize(place_name))

# find the 30 most frequent words
most_freq_words = Counter(all_words).most_common(30)

# remove a list of all words that are not relevant
words_to_remove = [',','.','and','of','the','The','for',"'s",'&','(',')','J.','Melbourne',
                   'UK','VIC','London','A',
                   'Jane','Lapham','*','Adelaide',"'",'Sydney','NSW','New','South','Wales']

# remove the words from the list of most frequent words
most_freq_words = [word for word in most_freq_words if word[0] not in words_to_remove]

most_freq_words_dict = dict(most_freq_words)

# # add value of two keys
# most_freq_words_dict['Gallery'] = most_freq_words_dict['Gallery'] + most_freq_words_dict['Galleries']
# most_freq_words_dict['Museum'] = most_freq_words_dict['Museum'] + most_freq_words_dict['Museums']

# # remove key 'Gallery'
# most_freq_words_dict.pop('Galleries')
# most_freq_words_dict.pop('Museums')

# create a wordcloud with the most frequent words
from wordcloud import WordCloud

wordcloud = WordCloud(width = 800, height = 800,
                background_color ='white',
                min_font_size = 10, random_state=300).generate_from_frequencies(most_freq_words_dict)

# plot the WordCloud image
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)

plt.show()

As shown above, dendrograms can be a useful exercise for exploring the data. The initial idea was to repeat the same approach with organisations to assess any latent patterns through this taxonomic approach. However, the organisation data is not as rich as the venue data: many organisations have no biographies, no summaries, and no relations to people records, exhibition records, and so on. Nevertheless, we provide one dendrogram using organisation biographies.

Below is a list of the proportion of missing data for each field across DAAO organisations.

Hide code cell source
# find the proportion of null values in each column
display((daoo_org_data.isnull().sum()/daoo_org_data.shape[0]))
_id                      0.000000
data_source              0.000000
_class                   0.000000
_class_ori               0.000000
ori_id                   1.000000
ori_dbid                 0.000000
ori_url                  0.000000
date_created             0.105490
date_modified            0.320775
record_status            0.000000
version                  0.000000
contributors             1.000000
note                     1.000000
summary                  0.609257
primary_name             0.000000
alternative_names        0.846071
types                    0.864370
longterm_roles           0.000000
operation                0.000000
locations                0.819160
nla                      0.996771
biography                0.534984
contact                  1.000000
copyright_agents         0.997847
editing_complete         0.789020
email                    1.000000
fax                      1.000000
is_australian            1.000000
is_deleted               0.000000
is_featured              0.000000
is_locked                0.000000
is_primary               0.000000
is_shadow                0.000000
locked_biographies       0.997847
ori_dbid_unf             1.000000
ori_url_data             1.000000
other_occupations        0.987083
periods_active           0.814855
phones                   1.000000
place_of_demise          1.000000
place_of_origin          1.000000
pubts                    1.000000
references               0.503767
see_alsos                0.913886
source_database_ids      0.928956
sources                  0.603875
tags                     0.820237
urls                     0.853606
web_links                1.000000
related_events           1.000000
related_organizations    1.000000
related_people           1.000000
related_recognitions     1.000000
related_resources        1.000000
related_works            1.000000
dtype: float64
Hide code cell content
from bs4 import BeautifulSoup

orgs = pd.DataFrame(columns=['name','summary','bio'])

for i,row in daoo_org_data.iterrows():
    try:
        try: pd.json_normalize(json.loads(row['biography']))['text']
        except: continue

        try: this_summary = ast.literal_eval(row['summary'])
        except: this_summary = None

        # try: period_end = row['periods_active'][0]['end']['_date']
        # except: period_end = None

        # use pandas.concat to append new row to dataframe
        orgs = pd.concat([orgs, pd.DataFrame({'name': [ast.literal_eval(row['primary_name'])],
                                                'summary': [this_summary],
                                                'bio': [pd.json_normalize(json.loads(row['biography']))['text'].values[0]]
                                                })], ignore_index=True)
        
    except:
        print(i)
        break

# replace missing summaries with empty strings
orgs['summary'].fillna('', inplace=True)

# remove rows with stub text
orgs = orgs[~orgs['bio'].isin(orgs['bio'].value_counts().head(5).index.to_list())]

# combine summary and bio
orgs['bio'] = orgs['summary'] + orgs['bio'].apply(lambda x: BeautifulSoup(x, 'lxml').get_text())

orgs = orgs[['name','bio']]
Hide code cell content
### load bert model
# # !pip install transformers
# from transformers import BertTokenizer, BertModel
# from transformers import pipeline
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained("bert-base-uncased")

# ### encode text using bert (takes 8secs to run)
# import random
# import pickle

# # encode the text using bert
# def bert_encode(x):
#   encoded_input = tokenizer(x, return_tensors='pt')
#   output = model(**encoded_input)
#   return pd.DataFrame(output['pooler_output'].detach().numpy()).T

# ### pre-process for NLP
# # Load the documents and their corresponding categorical variables into a Pandas dataframe
# df_org = pd.DataFrame({'text': orgs['bio'], 'category': orgs['name']})

# #add new column with count for each category
# df_org['cat_count'] = df_org.groupby('category')['category'].transform('count')
# df_org.drop_duplicates(inplace=True)

# # Clean the text
# stop_words = set(stopwords.words('english'))

# def clean_text(text):
#     text = re.sub('[^a-zA-Z]', ' ', text)
#     text = text.replace(',', '')
#     text = text.lower().split()
#     text = [word for word in text if word not in stop_words]
#     text = ' '.join(text)
#     return text

# df_org['clean_text'] = df_org['text'].apply(clean_text)

# # randomly sample 350 words from each row of df_org['clean_text'] to keep inputs
# # well under BERT's 512-token limit; some strings are shorter than 350 words
# df_org['clean_text_sampled'] = df_org['clean_text'].apply(lambda x: ' '.join(random.sample(x.split(' '), 350)) if len(x.split(' ')) >= 350 else x)
# X_bert_org = df_org['clean_text_sampled'].apply(lambda x: pd.Series(bert_encode([str(x)])[0]))

# # setting distance_threshold=0 ensures we compute the full tree.
# model_bert_org = AgglomerativeClustering(distance_threshold=0, n_clusters=None)
# model_bert_org = model_bert_org.fit(np.array(X_bert_org))

# # save model as pickle
# pickle.dump(model_bert_org, open('models/model_bert_orgs.pkl', 'wb'))

Biographical data for each organisation (~120 organisations) is used to create the clusters. For each cluster, we then show the most common terms used in the organisation names.

Hide code cell source
# # fetch bert model from github
# model_bert_org = fetch_bert_models_from_github("model_bert_orgs.pkl")

# cut = 5
# l_matrix = get_linkage_matrix(model_bert_org)
# df_org['cluster'] = fcluster(l_matrix, cut, criterion='maxclust')
# dendrogram(l_matrix, orientation='top', truncate_mode="lastp", p=cut, show_leaf_counts=True)

# all_words = []

# for i in df_org['cluster'].unique():
#     cluster_docs = df_org[df_org['cluster'] == i]
#     # print(i, get_most_common_word(cluster_docs['clean_text']))
#     [all_words.append(i[0]) for idx,i in enumerate(get_most_common_word(cluster_docs['category'], no_of_words=5))]
    
# all_words_to_remove = find_duplicates(all_words, occurences=4)
# all_words_to_remove.extend(['h','co','e','test','Lapham','J.','Jane','','and','of','&','The','SymbioticA','Nevin','Smith'])

# for i in df_org['cluster'].unique():
#     cluster_docs = df_org[df_org['cluster'] == i]
#     # print(i, get_most_common_word(cluster_docs['clean_text']))
#     annot = "\n".join(i[0] for idx,i in enumerate(get_most_common_word(cluster_docs['category'], no_of_words=6,
#                                                                        more_words=all_words_to_remove)) if (idx < 3))
    
#     plt.annotate(annot, xy=(i/df_org['cluster'].nunique()-0.1, 0.05), 
#                  xytext=(i/df_org['cluster'].nunique()-0.1, 0.05), 
#                  xycoords='axes fraction', fontsize=12)
    
#     # annot2 = cluster_docs.sort_values('cat_count', ascending=False)['category'].values[0:3]
#     # annot2 = '\n\n'.join(['\n'.join(wrap(line, 18)) for line in [i.split(',')[0] for i in annot2]])
#     # # annot2 = '\n'.join(wrap(annot2, 18)) # breaks strings into new lines

#     # plt.annotate(annot2, xy=(i/df_org['cluster'].nunique()-0.115, -0.24), 
#     #             xytext=(i/df_org['cluster'].nunique()-0.115, -0.24), 
#     #             xycoords='axes fraction', fontsize=9)

# plt.title("Hierarchical Clustering Dendrogram - BERT, Organisations")

# # make figure bigger
# fig = plt.gcf()
# fig.set_size_inches(14, 10)

# plt.show()

# # save the figure
# fig.savefig('images/images_analysis/DAAOVenues_BERT_orgs.png', dpi=300, bbox_inches='tight')
# df_org.to_csv('data/local/DAAOVenues_BERT_orgs.csv', index=False)

from IPython.display import Image
Image(filename='images/images_analysis/DAAOVenues_BERT_orgs.png')
Hide code cell source
# read from github
df_model_bert_org = fetch_small_data_from_github("DAAOVenues_BERT_orgs.csv")

# display data
show(df_model_bert_org.drop(['clean_text','clean_text_sampled','text','cat_count'],axis=1), scrollY="400px", scrollCollapse=True,
     paging=False, showIndex=False, column_filters="footer", dom="tpr")
[Interactive table output with columns: category, cluster]

Temporal analysis of venues#

To further explore frequently used terms in place names, we produce an interactive time series of the number of events per decade, broken down by the terms appearing in the place names.

Before the visuals, we provide an ordered list of the decades with the most event activity, and also a list of the most frequent terms used in place names.

Hide code cell source
events_df = clean_data_v2.copy()

# create a new column with the decade of the start year
events_df['decade'] = events_df['start_year'].apply(lambda x: str(x)[:3] + '0')
# group the lone 2020s record with the 2000s
events_df['decade'] = events_df['decade'].apply(lambda x: '2000' if x == '2020' else x)
events_df['decade'] = events_df['decade'].astype(int)
display(events_df['decade'].value_counts())
1990    1488
2000    1114
1980    1042
1970     515
2010     212
1960      77
1950      39
1940      13
1900       7
1910       6
1930       5
1920       4
Name: decade, dtype: int64
Hide code cell source
most_freq_words_dict2
{'Art': 2992,
 'Gallery': 2606,
 'University': 502,
 'Museum': 473,
 'Contemporary': 472,
 'Centre': 456,
 'Experimental': 424,
 'Foundation': 424,
 'Modern': 290,
 'City': 282,
 'National': 249,
 'Institute': 235,
 'Regional': 184,
 'Museums': 128,
 'Australian': 116,
 'Place': 108,
 'Region': 106,
 'Fine': 102}
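most_freq_words_dict2 is built earlier in the notebook; as a minimal sketch, a term-frequency dictionary of this kind can be derived from the place names as follows (the actual stop-word filtering applied earlier may differ):

# count alphabetic tokens across all place names and keep the most common ones
from collections import Counter
from nltk.tokenize import word_tokenize

tokens = [t for name in events_df['address_prompt'].dropna()
          for t in word_tokenize(str(name)) if t.isalpha()]
print(dict(Counter(tokens).most_common(20)))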
Hide code cell source
words_by_decade = pd.DataFrame()

for key in most_freq_words_dict2:
    words_this_decade = events_df[events_df['address_prompt'].str.contains(key, na=False)]\
                        .groupby('decade')['address_prompt']\
                        .count()\
                        .reset_index(name='count')\
                        .sort_values(['decade'], ascending=True)
    words_this_decade['term'] = key
    words_by_decade = pd.concat([words_by_decade, words_this_decade], ignore_index=True)
Hide code cell source
import plotly.express as px
fig = px.line(words_by_decade, x="decade", y="count", color='term',
                title='Count of each term by decade')


# # make figure size bigger
# # fig.update_layout(
# #     autosize=False,
# #     width=700,
# #     height=500,
# # )

fig.show()
Hide code cell source
# events_df['diff'] = events_df['end_year'] - events_df['start_year']
# # events_df[(events_df['place_name'].str.contains('Contemporary')) & (events_df['diff'] > 1)]

# for each venue, take the earliest start_year and the latest end_year across its events
places_with_start_end = []
for p in events_df['place_address'].unique():
    places_with_start_end.append([p,
                                  events_df.loc[events_df['address_prompt'] == p, 'start_year'].min(),
                                  events_df.loc[events_df['address_prompt'] == p, 'end_year'].max()])

places_with_start_end = pd.DataFrame(places_with_start_end)
places_with_start_end.columns = ['address_prompt', 'start_year', 'end_year']
places_with_start_end['diff'] = places_with_start_end['end_year'] - places_with_start_end['start_year']
places_with_start_end = places_with_start_end[places_with_start_end['diff'] >= 0]
# places_with_start_end.sort_values('diff', ascending=False)

Beyond the time series above, we also provide a series of bar charts; however, instead of illustrating event activity within each decade, we plot the number of venues active over time. We calculate the start and end of a venue's “lifespan” using the earliest and latest event dates associated with that venue, and we again filter on venues whose place names contain the associated terms.
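The same lifespans can also be computed with a single groupby; the sketch below is equivalent under the assumption that place_address and address_prompt label venues identically, which the hidden cell above implicitly makes:

# earliest and latest event year per venue, dropping negative (inconsistent) spans
lifespans = (events_df.groupby('address_prompt')
             .agg(start_year=('start_year', 'min'), end_year=('end_year', 'max'))
             .reset_index())
lifespans['diff'] = lifespans['end_year'] - lifespans['start_year']
lifespans = lifespans[lifespans['diff'] >= 0]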

Hide code cell source
decade_placeholder = pd.DataFrame([0] * 13, index=range(1900, 2020+1, 10)).reset_index().rename(columns={'index':'decade', 0:'count'})
decade_placeholder['decade'] = decade_placeholder['decade'].astype(str)

for term in most_freq_words_dict2:
    # create a wide form of contemporary with each decade as a column and a binary value for whether the venue existed in that decade
    # use start_year as the start and end_year as the end of the venue
    contemporary = places_with_start_end[(places_with_start_end['address_prompt'].str.contains(term))].sort_values('start_year', ascending=True)
    contemporary_wide = pd.DataFrame()

    for i,row in contemporary.iterrows():
        for year in range(int(row['start_year']), int(row['end_year'])+1): 
            contemporary_wide.loc[row['address_prompt'], str(year)[:3] + '0'] = 1

    contemporary_wide = contemporary_wide.\
        fillna(0).\
        reset_index().\
        rename(columns={'index':'address_prompt'}).\
        sum().tail(-1).\
        reset_index().\
        rename(columns={'index':'decade', 0:'count'})
    
    contemporary_wide = pd.merge(contemporary_wide, decade_placeholder, on='decade', how='outer')
    contemporary_wide['count'] = contemporary_wide['count_x'] + contemporary_wide['count_y']
    contemporary_wide = contemporary_wide[['decade', 'count']].sort_values('decade', ascending=True)

    fig = px.bar(contemporary_wide, x="decade", y='count',
                    title=f'Number of venues that existed in each decade, {term}')

    # remove y-axis label
    fig.update_yaxes(title_text='')
    fig.update_xaxes(title_text='')

    # # make figure size bigger
    # fig.update_layout(
    #     autosize=False,
    #     width=700,
    #     height=400,
    # )

    fig.show()

Proportions#

We repeat the same temporal visuals as above, but this time we plot proportions, normalising the data with respect to the number of events in each decade and the number of venues active over time, respectively.

Note that the y-axis limits in the bar charts change for each term.
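For concreteness, the event-level proportion for a given term and decade is just the share of that decade's events whose place name contains the term (an illustrative check; the term and decade below are arbitrary examples):

term, decade = 'Contemporary', 1990   # arbitrary example values
num = events_df[events_df['address_prompt'].str.contains(term, na=False) &
                (events_df['decade'] == decade)].shape[0]
den = events_df[events_df['decade'] == decade].shape[0]
print(f"{term}, {decade}s: {num}/{den} = {num/den:.2%}")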

Hide code cell source
words_by_decade_prop = []

# find the proportions of each word in each decade
for i in most_freq_words_dict2.keys():
    for j in events_df['decade'].unique():
        prop = events_df[(events_df['address_prompt'].str.contains(i)) & (events_df['decade'] == j)].shape[0] /events_df[events_df['decade'] == j].shape[0]
        words_by_decade_prop.append([i,j,prop])
    
words_by_decade_prop = pd.DataFrame(words_by_decade_prop)
words_by_decade_prop.columns = ['word','decade','proportion']
Hide code cell source
# plot proportion of each term by decade
# make it interactive

import plotly.express as px

fig = px.line(words_by_decade_prop.sort_values('decade'), x="decade", y="proportion", color='word',
                title='Proportion of each term by decade')

# make figure size bigger
# fig.update_layout(
#     autosize=False,
#     width=700,
#     height=500,
# )

fig.show()
Hide code cell source
allplaces_wide = pd.DataFrame()

for i,row in places_with_start_end.iterrows():
    for year in range(int(row['start_year']), int(row['end_year'])+1): 
        allplaces_wide.loc[row['address_prompt'], str(year)[:3] + '0'] = 1

decade_placeholder_forprop = allplaces_wide.\
                            fillna(0).\
                            reset_index().\
                            rename(columns={'index':'address_prompt'}).\
                            sum().tail(-1).\
                            reset_index().\
                            rename(columns={'index':'decade', 0:'count'}).\
                            sort_values('decade', ascending=True)
Hide code cell source
for term in most_freq_words_dict2:
    # create a wide form of contemporary with each decade as a column and a binary value for whether the venue existed in that decade
    # use start_year as the start and end_year as the end of the venue
    contemporary = places_with_start_end[(places_with_start_end['address_prompt'].str.contains(term))].sort_values('start_year', ascending=True)
    contemporary_wide = pd.DataFrame()

    for i,row in contemporary.iterrows():
        for year in range(int(row['start_year']), int(row['end_year'])+1): 
            contemporary_wide.loc[row['address_prompt'], str(year)[:3] + '0'] = 1

    contemporary_wide = contemporary_wide.\
        fillna(0).\
        reset_index().\
        rename(columns={'index':'address_prompt'}).\
        sum().tail(-1).\
        reset_index().\
        rename(columns={'index':'decade', 0:'count'})
    
    contemporary_wide = pd.merge(contemporary_wide, decade_placeholder_forprop, on='decade', how='outer')
    contemporary_wide['prop'] = np.where(contemporary_wide['count_x'] > 0, 
                                     contemporary_wide['count_x']/contemporary_wide['count_y'], 0)
    contemporary_wide = contemporary_wide[['decade', 'prop']].sort_values('decade', ascending=True)

    fig = px.bar(contemporary_wide, x="decade", y='prop',
                    title=f'Proportion of venues that existed in each decade, {term}')

    # remove y-axis label
    fig.update_yaxes(title_text='')
    fig.update_xaxes(title_text='')

    # make figure size bigger
    # fig.update_layout(
    #     autosize=False,
    #     width=700,
    #     height=400,
    # )

    fig.show()