Exploring success quantitatively using Archibald Prize data

Exploring success quantitatively using Archibald Prize data#

This is an exploratory data analysis of collected data from Art Gallery NSW among other external sources. We focus on the Archibald Prize and take a deep dive into temporal trends relating to gender, portrait characteristics and career paths. Data ranges over 100 years (1921-2022).

The data consists of…

participation records
prize money records
image data of winning portraits
basic biographical data for winners

The analytical work presented on this page served as the initial exploratory data analysis for a Guardian article published in May 2023.

Import packages and pre-process data#

We have provided the code used to scrape the raw data from the Art Gallery NSW website that is processed in this analysis, but for the sake of brevity we will not run the scraping code here. Instead, we will import pre-processed data from the data/analysis folder located in Github.

Show code cell source Hide code cell source

########## Collect data from the Art Gallery of NSW website ###########
# Base URL of the Art Gallery of NSW website. The scraping code appends
# prize paths ("prizes/<prize>/<year>") and image paths to this prefix.
# NOTE: a `global` statement has no effect at module level, so none is needed.
mainURL = 'https://www.artgallery.nsw.gov.au/'

def assort_prize_metadata(text):
    """Sort the text lines of a prize page into labelled metadata fields.

    Parameters
    ----------
    text : list of str
        Lines of the page's text block (the page text split on line breaks).

    Returns
    -------
    dict
        Keys, in order: 'Entries', 'Presenting partner', 'Sponsor',
        'Exhibition dates', 'Misc.' and 'Text'. Every value defaults to ''.

    Notes
    -----
    A double space ('  ') is treated as the boundary between a short metadata
    value and the free-text description that follows it on the same line.

    * A line that mentions a field label and has no double space is stored
      with its "<label>: " prefix removed.
    * A line that mentions a field label and does contain a double space is
      split there: the part before goes to the field (label prefix kept, as
      before) and the remainder, including the double space, goes to 'Text'.
    * If no such line produced a 'Text', the last line is split the same way
      into 'Misc.' and 'Text'.

    The remainder is taken by slicing rather than by ``str.split`` on the
    extracted value. Splitting raised ``ValueError`` when the line began with
    the double space (empty separator) and silently truncated 'Text' when the
    extracted value occurred again later in the line.
    """
    labelled_fields = ('Entries', 'Presenting partner', 'Sponsor', 'Exhibition dates')

    prize_dict = {field: '' for field in labelled_fields}
    prize_dict['Misc.'] = ''
    prize_dict['Text'] = ''

    for t in text:
        for k in labelled_fields:
            if k not in t:
                continue
            if '  ' not in t:
                prize_dict[k] = t.strip().replace(k + ': ', '')
            else:
                head = t.split('  ')[0]
                prize_dict[k] = head
                # everything after the value, leading double space included
                prize_dict['Text'] = t[len(head):]
            break  # a line is assigned to the first matching label only

    # Fall back to the last line when no labelled line carried a description.
    # `text` may be empty, in which case all fields keep their '' default.
    if prize_dict['Text'] == '' and text:
        misc = text[-1].split('  ')[0]
        prize_dict['Misc.'] = misc
        if misc:
            prize_dict['Text'] = text[-1][len(misc):]

    return prize_dict

def collect_records(prize = 'archibald', prize_year = 1921):
    """Scrape one prize page of the Art Gallery of NSW website.

    Parameters
    ----------
    prize : str
        Prize slug used in the URL path (``prizes/<prize>/<prize_year>``).
    prize_year : int or str
        Year segment of the URL. It is only ever converted with ``str()``,
        so a string such as ``'1991-92'`` is accepted as well.

    Returns
    -------
    dict
        The dictionary built by ``assort_prize_metadata`` plus two keys:

        * ``'winner_info'``: ``[artist, title, image_src]``. All three are
          ``None`` when the page has no winner card; ``image_src`` alone is
          ``None`` when the winning image could not be found or saved.
        * ``'participant_info'``: list of ``[href, artist, title, label]``.

    Raises
    ------
    AttributeError
        If the page has no ``<div class="grid text">`` element.
    IndexError
        If a participant entry lacks one of the tags read from it.
    requests.RequestException
        If the prize page itself cannot be fetched.

    Side effects
    ------------
    Saves the winning image (when available) as
    ``ArchibaldWinners/<prize_year>_<image file name>``.
    """
    import os  # local import: only needed to create the image folder

    prize_url = mainURL + "prizes/" + prize + '/' + str(prize_year)
    page = requests.get(prize_url, timeout=30)
    soup = BeautifulSoup(page.content, "html.parser")

    # fetch winner data
    artist_tags = soup.find_all("span", class_="card-prizesWinner-artist")
    title_tags = soup.find_all("span", class_="card-prizesWinner-title")

    if artist_tags and title_tags:
        winner_artist = artist_tags[0].text
        winner_title = title_tags[0].text

        # Downloading the image is best effort: any failure (no <img> tag,
        # network error, unwritable folder) leaves winner_image as None.
        try:
            winner_image = soup.find_all("img", class_="card-prizesWinner-image")[0].get('src')
            image_url = mainURL + winner_image
            os.makedirs('ArchibaldWinners', exist_ok=True)
            # The file name is prefixed with the year being scraped. This
            # previously referenced an undefined name `yr`, so the resulting
            # NameError was swallowed and no image was ever saved.
            with open('ArchibaldWinners/' + str(prize_year) + '_' + basename(image_url), "wb") as f:
                f.write(requests.get(image_url, timeout=30).content)
        except Exception:
            winner_image = None

        winner_info = [winner_artist, winner_title, winner_image]
    else:
        winner_info = [None, None, None]

    # pre-process: turn <br> tags into an unambiguous delimiter so that the
    # text block can be split back into its visual lines
    delimiter = '###'
    for line_break in soup.find_all('br'):
        line_break.replace_with(delimiter)
    textModule = soup.find("div", class_="grid text").get_text().split(delimiter)

    # fetch prize metadata
    prize_metadata_dict = assort_prize_metadata(text=textModule)
    prize_metadata_dict['winner_info'] = winner_info

    # fetch participant data
    participants = []
    text_blocks = soup.find_all("div", class_="grid text")

    if len(text_blocks) > 1:
        # layout 1: participants are <li> items in the second text block
        for item in text_blocks[1].find_all('ul')[0].find_all('li'):
            links = item.find_all("a")
            participant_href = links[0].get('href') if links else ''

            participant_artist = item.find_all("strong")[0].text
            participant_title = item.find_all("em")[0].text

            # the label is whatever follows the title in the item's text;
            # str.split raises ValueError when the title is empty
            try:
                participant_label = item.text.split(participant_title)[-1].strip()
            except ValueError:
                participant_label = ''

            participants.append([participant_href, participant_artist, participant_title, participant_label])
    else:
        # layout 2: participants are artwork cards
        for item in soup.find_all("div", class_="artworksList-item"):
            participant_href = item.find_all("a", class_="card-artwork-link")[0].get('href')
            participant_artist = item.find_all("span", class_="card-artwork-artist")[0].text
            participant_title = item.find_all("span", class_="card-artwork-title")[0].text
            participant_label = item.find_all("p", class_="card-artwork-label")[0].text
            participants.append([participant_href, participant_artist, participant_title, participant_label])

    prize_metadata_dict['participant_info'] = participants
    return prize_metadata_dict

def fetch_archibald_data():
    """Scrape every Archibald Prize page (1921-2022) and export two CSV files.

    Writes
    ------
    data/local/archies.csv
        One row per year with columns 'Prize Data' (the dictionary returned
        by ``collect_records``, or empty when that year could not be
        scraped) and 'Year'.
    data/local/archies_allparticipants_byyear.csv
        One row per participant, with the four participant fields followed
        by 'Year'. Years that could not be scraped contribute no rows.

    Returns
    -------
    None
    """
    import os  # local import: only needed to create the output folder

    # (URL year segment, year label) pairs. The 1991/92 edition has its own
    # URL segment and is labelled '1992'; the range 1921-1990 comes before it
    # and 1993-2022 after it.
    editions = [(yr, yr) for yr in range(1921, 1991)]
    editions.append(('1991-92', '1992'))
    editions.extend((yr, yr) for yr in range(1993, 2023))

    archibald_data_dict = {'Prize Data': [], 'Year': []}
    for url_year, year_label in editions:
        # Scraping is best effort: a year whose page cannot be fetched or
        # parsed is recorded as None rather than aborting the whole run.
        try:
            record = collect_records(prize='archibald', prize_year=url_year)
        except Exception:
            record = None
        archibald_data_dict['Prize Data'].append(record)
        archibald_data_dict['Year'].append(year_label)

    os.makedirs('data/local', exist_ok=True)

    # Convert the dictionary to a dataframe and write it as a csv file
    archies = pd.DataFrame(archibald_data_dict)
    archies.to_csv('data/local/archies.csv', index=False)

    # Convert the participant data to a dataframe and write it as a csv file
    yearly_frames = []
    for year_label, record in zip(archibald_data_dict['Year'],
                                  archibald_data_dict['Prize Data']):
        if record is None:
            # failed year: subscripting None used to raise TypeError here
            continue
        this_yr = pd.DataFrame(record['participant_info'])
        this_yr['Year'] = year_label
        yearly_frames.append(this_yr)

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    # The per-year row index is kept, as it was with append.
    partipants_names = pd.concat(yearly_frames) if yearly_frames else pd.DataFrame()
    partipants_names.to_csv('data/local/archies_allparticipants_byyear.csv')

# uncomment the line below to repeat the data collection process
# note this will export a file into your local directory - change this accordingly
# fetch_archibald_data()

########### Read csv file as dataframe ###########
# this imported dataset was further preprocessed by filtering on winners 
# and manually adding columns in regard to each winner's biographical information
# along with corresponding ANZSCO classification data

def fetch_small_data_from_github(fname):
    """Download a CSV file from the ACDE jupyterbook repository as a dataframe.

    Parameters
    ----------
    fname : str
        File name inside the repository's ``data/analysis`` folder.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV content.

    Raises
    ------
    requests.HTTPError
        If GitHub answers with an error status (e.g. 404 for a misspelt file
        name). Without this check the error page itself would be parsed as
        CSV and returned as a meaningless dataframe.
    """
    url = f"https://raw.githubusercontent.com/acd-engine/jupyterbook/master/data/analysis/{fname}"
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    rawdata = response.content.decode('utf-8')
    return pd.read_csv(io.StringIO(rawdata))

# load the pre-processed winners table from the GitHub repository
archies = fetch_small_data_from_github("archies_preprocessed.csv")

# render the table as an interactive, horizontally and vertically
# scrollable widget with per-column filters in the footer
show(
    archies,
    scrollY="400px",
    scrollCollapse=True,
    scrollX=True,
    paging=False,
    showIndex=False,
    column_filters="footer",
    dom="tpr",
)

YEAR	WINNER	GENDER	DOB	DOD	PORTRAIT_TITLE	Sitter	Sitter_DOB	Sitter_Age	Self_Portait	PORTRAIT_GENDER	PORTRAIT_OCCUPATION	OCCUPATION_CATEGORY_1	OCCUPATION_CATEGORY_2	ANZSCO_1	ANZSCO_2
Loading... (need help?)
YEAR	WINNER	GENDER	DOB	DOD	PORTRAIT_TITLE	Sitter	Sitter_DOB	Sitter_Age	Self_Portait	PORTRAIT_GENDER	PORTRAIT_OCCUPATION	OCCUPATION_CATEGORY_1	OCCUPATION_CATEGORY_2	ANZSCO_1	ANZSCO_2

Gender distribution#

Male and female distribution for Archibald winners#

We use a donut chart to first explore how gender has been recorded for Archibald winners; 88% of the data has been recorded as Male and 12% as Female.

It should be noted that for three years (1964, 1980 and 1991), there were no Archibald prize winners.

_images/b8c85161ea5400abdef6fd93b86e85460a267dcb1aa01284b84022e0f0acc94a.png

Male and female distribution of sitters for winning Archibald portraits#

Beyond the winning painter, we also assess the gender distribution of the sitters within the winning portraits. Again we use a donut chart to explore the distribution. According to data collected from various online sources, we found that 82% of sitters were recorded as Male, and 18% as Female.

_images/8c19814d91593a28f89f37819cd1ca735c7b446f8e5209b8ff40e339fafacfcc.png

Do males paint males?#

We also consider the gender distribution of sitters by male and female Archibald winners. The clustered bar chart shows that 86% of winning portraits painted by males depict male sitters. This differs considerably from winning portraits painted by females, which show an even distribution (50% male sitters, 50% female sitters). It should be noted that there are 12 winning portraits painted by females.

_images/08801c7d392f9580a8b96bad86c0fe954872fefefea61efe07e9491682e5ceb6.png

Male and female distribution over time#

The two time series visualisations below show the gender proportions of Archibald winners and sitters across twenty-year brackets (vicennia). The data for Archibald winners reveals that only in recent decades have females won a higher proportion of Archibald prizes than in earlier vicennia. The trend for sitters follows a similar pattern to the Archibald winners time series. Following our previous insights, this suggests that as more female artists win Archibalds, there is a corresponding increase in the number of female sitters being painted.

Show code cell source Hide code cell source

### create a new column for the year of the vicennium
# Floor every year to the start of its 20-year bracket (e.g. 1938 -> 1920).
archies['year_vicennium'] = (archies['YEAR'].astype(int) // 20) * 20

# Years from 2020 onwards are folded into the bracket starting in 2000,
# which is why the last tick label on the plots is the open-ended '2000-'.
archies['year_vicennium'] = np.where(archies['year_vicennium'] == 2020, 2000, archies['year_vicennium'])


def _count_by_vicennium(frame, column, value):
    """Count the rows of *frame* per vicennium where ``frame[column] == value``.

    Returns a dataframe with the columns 'index' (vicennium start year) and
    'year_vicennium' (number of rows), sorted by vicennium. The column names
    are set explicitly because the names produced by
    ``value_counts().reset_index()`` changed in pandas 2.0, which made the
    former ``sort_values('index')`` raise ``KeyError``.
    """
    counts = frame.loc[frame[column] == value, 'year_vicennium'].value_counts().sort_index()
    return counts.rename_axis('index').reset_index(name='year_vicennium')


def _gender_proportions(male_counts, female_counts):
    """Merge male/female counts and add each gender's share per vicennium.

    Returns a dataframe with the columns 'Vicennium', 'Males', 'Females',
    'Females_Prop' and 'Males_Prop' (shares rounded to two decimals), sorted
    by vicennium with a fresh 0..n-1 index. A vicennium missing for one
    gender is counted as 0 for that gender.
    """
    table = pd.merge(male_counts, female_counts, on='index', how='outer').fillna(0)
    table.columns = ['Vicennium', 'Males', 'Females']
    table = table.sort_values('Vicennium').reset_index(drop=True)

    total = table['Females'] + table['Males']
    table['Females_Prop'] = (table['Females'] / total).round(2)
    table['Males_Prop'] = (table['Males'] / total).round(2)
    return table


def _plot_gender_proportions(table, title, female_label_scale):
    """Draw the male/female share per vicennium as two annotated lines.

    *female_label_scale* is the factor applied to a female share to position
    its percentage label above the marker (the male labels use 1.035).
    Returns the ``(fig, ax)`` pair of the figure that was shown.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    series_specs = (('Males_Prop', 'Males', 1.035),
                    ('Females_Prop', 'Females', female_label_scale))

    for column, label, _ in series_specs:
        ax.plot(table['Vicennium'], table[column], label=label, marker='o')

    # percentage label slightly above every marker
    for column, _, label_scale in series_specs:
        for vicennium, share in zip(table['Vicennium'], table[column]):
            ax.annotate(str(int(round(share * 100, 0))) + '%',
                        (vicennium, share * label_scale),
                        ha='center', va='bottom', size=12.5)

    # adjust legend
    ax.legend(loc="upper right", ncol=2)

    # the y axis is hidden because every point carries its own label
    ax.yaxis.set_ticklabels([])
    ax.yaxis.set_ticks([])
    ax.set_xlabel('')
    ax.set_ylim([-0.1, 1.23])
    ax.grid(axis='x')
    ax.set_xticks([1920, 1940, 1960, 1980, 2000])
    ax.set_xticklabels(['1920-1940', '1940-1960', '1960-1980', '1980-2000', '2000-'])
    ax.set_title(title)
    plt.show()
    return fig, ax


### get count by gender
males_tab = _count_by_vicennium(archies, 'GENDER', 'Male')
females_tab = _count_by_vicennium(archies, 'GENDER', 'Female')
males_sitters_tab = _count_by_vicennium(archies, 'PORTRAIT_GENDER', 'Male')
females_sitters_tab = _count_by_vicennium(archies, 'PORTRAIT_GENDER', 'Female')

### merge tables and get row proportions for Males and Females
count_by_gender = _gender_proportions(males_tab, females_tab)
count_by_gender_sitter = _gender_proportions(males_sitters_tab, females_sitters_tab)

### plot gender proportions of winners over time
fig, ax = _plot_gender_proportions(
    count_by_gender,
    'Proportion of Archibald winners,\nMales and Females, 20-year periods',
    female_label_scale=1.1)

### plot gender proportions of sitters over time
fig, ax = _plot_gender_proportions(
    count_by_gender_sitter,
    'Proportion of sitters,\nMales and Females, 20-year periods',
    female_label_scale=1.09)

_images/ec2b83cb3be3e9ee259218ef9182ac099579544855e4588ae9d33b0d6f4689eb.png

_images/29ea86da90cbc2a987fde3ebf5f834140c3f597b96a62cd8096fde25d7085acd.png

Winning age for Archibald winners#

We use a histogram chart to explore the distribution of winning age. The histogram exhibits a relatively bi-modal shape with some painters winning the Archibald prize much later in their career. However, the majority cluster around the mid-40s.

The youngest painter to win the Archibald Prize was Nora Heysen at the age of 27 years (1938), and the oldest was John Olsen, winning at the age of 77 years (2005).

Furthermore, we calculate the median winning age by gender of the winning painter, and find that males (45) typically win later than females (39).

_images/b54573207e02225acf4f62ed5e4ecf8b2823232fe4e2f04b88053f7eda242bb7.png

Winning age by year#

The first line plot below shows the age of Archibald winners per year. At first glance, the winning age appears to fluctuate randomly, but there are some observable patterns prior to 1960. Upon closer examination, we discover that these gradual changes are the result of the same individuals winning the Archibald Prize multiple times.

We list the five most frequent Archibald winners, all of whom have won more than three prizes.

Artist	Number of Archibald prizes
William Dargie	8
W B McInnes	7
John Longstaff	5
Ivor Hele	5
William Pidgeon	4

The second line plot emphasises these five artists, highlighting some interesting insights.

The first 41 years of the Archibald prize were dominated by these multi-winners, who specifically won more than two thirds (68.3%) of the Archibald prizes awarded.
W B McInnes and John Longstaff dominated the 1920-1940 period, collectively winning 12 out of 19 Archibalds.
William Dargie and Ivor Hele dominated the 1940-1960 period, collectively winning 13 out of 20 Archibalds.
We see a lot more distribution amongst painters in recent decades, with fewer occurrences of repeat winners.

Show code cell source Hide code cell source

### plot winning age by year
years, ages = archies['YEAR'], archies['winning_age']

fig, ax = plt.subplots(figsize=(10, 6))

# faint connecting line underneath, one solid dot per winner on top
ax.plot(years, ages, alpha=0.35)
ax.plot(years, ages, marker='o', linestyle='', color='tab:blue')

# dashed reference line at the overall median winning age
ax.axhline(y=44, color='red', linestyle='--', lw=1.5, alpha=0.3)

ax.set_ylim([20, 90])
ax.set_title('Age at time of Archibald Prize win by year, Median = 44')
plt.show()

############################################

### plot winning age by year and highlight multi-winners
fig, ax = plt.subplots(figsize=(10, 6))

ax.plot(years, ages, alpha=0.35)
ax.axhline(y=44, color='red', linestyle='--', lw=1.5, alpha=0.3)

# highlighted artists and their marker colours, in legend order
multi_winner_colours = {
    'William Dargie': 'tab:orange',
    'W B McInnes': 'tab:purple',
    'John Longstaff': 'tab:pink',
    'Ivor Hele': 'tab:green',
    'William Pidgeon': 'tab:red',
}

for artist, colour in multi_winner_colours.items():
    wins = archies[archies['WINNER'] == artist]
    ax.plot(wins['YEAR'], wins['winning_age'],
            marker='o', linestyle='', color=colour, label=artist)

# every winner who is not one of the highlighted artists
others = archies[~archies['WINNER'].isin(list(multi_winner_colours))]
ax.plot(others['YEAR'], others['winning_age'],
        marker='o', linestyle='', color='tab:blue', label='Rest of winners')

# adjust legend
ax.legend(loc="upper right", ncol=3)
ax.set_title('Age at time of Archibald Prize win by year,\n\n')

# add subtitle (placed in the gap left by the blank lines of the title)
ax.text(0.5, 1.05, 'Artists who have won the Archibald Prize more than thrice are highlighted',
        horizontalalignment='center', verticalalignment='center',
        transform=ax.transAxes, fontsize=10)

ax.set_ylim([20, 90])
plt.show()

_images/67f1e20eb29822dccaf4e01a7b9a9d1a01e0d67d48fd4f0bcf6e54ff8b149c3a.png

_images/ce364733fbb49b8a6b27eeaa6ac244d49ebc159e912a85346aa4b86cad3843d1.png

Winning age for Archibald winners (cont.)#

To consider multi-winners, we assess the average winning age at different milestones in relation to the Archibald Prize (1st win, 2nd win, etc.). The bar plot shows a similar average (43.5) for first-time winners (highlighted in orange) when compared with the overall median (44). This is likely due to the fact that most artists have only won the prize once (62 artists).

When considering second wins, the average winning age increases to 48.5, but then decreases for subsequent wins. This pattern may be a result of small sample sizes, but also suggests that multi-winners tend to experience early success. The only exception is John Longstaff, who won all his prizes after the age of 64.

Interestingly, William Dargie was 44 years old when he won his eighth and final Archibald Prize, which is the same as the overall median winning age.

_images/bc2c26bb476223dd2de55dbaba9c72e80e0107dc08085492ecc2a6a82ae0fed1.png

Show code cell source Hide code cell source

# def upper_rugplot(data, height=.05, ax=None, **kwargs):
#     from matplotlib.collections import LineCollection
#     ax = ax or plt.gca()
#     kwargs.setdefault("linewidth", 1)
#     segs = np.stack((np.c_[data, data],
#                      np.c_[np.ones_like(data), np.ones_like(data)-height]),
#                     axis=-1)
#     lc = LineCollection(segs, transform=ax.get_xaxis_transform(), **kwargs)
#     ax.add_collection(lc)

# sns.set_theme(style="white", rc={"axes.facecolor": (0, 0, 0, 0), 'axes.linewidth':2})  
# palette = sns.color_palette("Paired", 8)    
# archies_density = archies[archies['count'] < 9].copy()  
# archies_density['count_verbose'] = np.where(archies_density['count'] == 1, '1st win', np.nan)  
# archies_density['count_verbose'] = np.where(archies_density['count'] == 2, '2nd win', archies_density['count_verbose'])  
# archies_density['count_verbose'] = np.where(archies_density['count'] == 3, '3rd win', archies_density['count_verbose'])  
# archies_density['count_verbose'] = np.where(archies_density['count'] == 4, '4th win', archies_density['count_verbose'])  
# archies_density['count_verbose'] = np.where(archies_density['count'] == 5, '5th win', archies_density['count_verbose'])  
# archies_density['count_verbose'] = pd.Categorical(archies_density['count_verbose'],   
# categories=['5th win','4th win','3rd win','2nd win','1st win'], ordered=True)    
# g = sns.FacetGrid(archies_density, palette=palette, row="count_verbose", hue="count_verbose", aspect=8, height=1.2)  
# g.map_dataframe(sns.kdeplot, x="winning_age", fill=True, alpha=0.9)  
# g.map_dataframe(sns.kdeplot, x="winning_age", color='black')    
# upper_rugplot(archies_density[archies_density['count'] == 8]['winning_age'], color=palette[7], linewidth=2.75, height=0.24, ax=g.axes[1,0])
# upper_rugplot(range(25,85), color='white', linewidth=3, height=.21, ax=g.axes[1,0])  
# upper_rugplot(archies_density[archies_density['count'] == 7]['winning_age'], color=palette[6], linewidth=2.75, height=0.21, ax=g.axes[1,0])
# upper_rugplot(range(25,85), color='white', linewidth=3, height=.18, ax=g.axes[1,0]) 
# upper_rugplot(archies_density[archies_density['count'] == 6]['winning_age'], color=palette[5], linewidth=2.75, height=0.18, ax=g.axes[1,0])
# upper_rugplot(range(25,85), color='white', linewidth=3, height=.15, ax=g.axes[1,0]) 
# upper_rugplot(archies_density[archies_density['count'] == 5]['winning_age'], color=palette[0], linewidth=2.75, height=0.15, ax=g.axes[1,0])  
# upper_rugplot(range(25,85), color='white', linewidth=3, height=.12, ax=g.axes[1,0])  
# upper_rugplot(archies_density[archies_density['count'] == 4]['winning_age'], color=palette[1], linewidth=2.75, height=.12, ax=g.axes[1,0])  
# upper_rugplot(range(25,85), color='white', linewidth=3, height=.09, ax=g.axes[1,0])  
# upper_rugplot(archies_density[archies_density['count'] == 3]['winning_age'], color=palette[2], linewidth=2.75, height=.09, ax=g.axes[1,0])  
# upper_rugplot(range(25,85), color='white', linewidth=3, height=.06, ax=g.axes[1,0])  
# upper_rugplot(archies_density[archies_density['count'] == 2]['winning_age'], color=palette[3], linewidth=2.75, height=.06, ax=g.axes[1,0])  
# upper_rugplot(range(25,85), color='white', linewidth=3, height=.03, ax=g.axes[1,0])  
# upper_rugplot(archies_density[archies_density['count'] == 1]['winning_age'], color=palette[4], linewidth=2.75, height=.03, ax=g.axes[1,0])    

# def label(x, color, label):      
#     ax = plt.gca()      
#     ax.text(0.9, .1, label, color=color, fontsize=13,      
#     ha="left", va="center", transform=ax.transAxes)    
    
# g.map(label, "count_verbose")  
# g.fig.subplots_adjust(hspace=-0.8)  
# g.set_titles("")  
# g.set(yticks=[], xlabel="", ylabel="", ylim=[0, 0.045])  
# g.despine( left=True)    
# plt.suptitle('Distribution of winning age at different milestones', x=0.52, y=0.9)  
# plt.show()

# The majority of this visualisation was constructed in Python (see the
# commented-out code above); however, extra editing was done outside of
# Python to get the final image, so the finished PNG is loaded from disk.
from IPython.display import Image
# Left as a bare final expression so that the notebook displays the image.
Image(filename='images/images_analysis/StackedDensity.png', width=800)

_images/d9fe328d7be826ebe89373a9d3fb8698ca8db50108a27a00a74b00854d3203fd.png

By the time the average participant achieves their first Archibald Prize, William Dargie had already secured his eighth Archibald win.

Winning age for Archibald winners by vicennium#

By analysing the winning age data by milestone and decade, we can observe that the average winning age for first-time winners has experienced fluctuations over time. During the 1920-1940 period, the median winning age for first-time winners was 35. However, this average rose to 46.5 over the next forty years and then dropped back to 40 in the 2000s. A similar pattern was observed for second-time winners, with a peak median of 60.5 in the 1980-2000 period.

As illustrated in previous visualisations, third-time winners and beyond tend to occur more often in earlier decades. The last artist to win three Archibald prizes was Eric John Smith in 1982 at the age of 63.

_images/2decd89c96ced2b944f67448a0098ab08751ea8f1428908cbbafe94252c564db.png

Colour and Brightness#

The below visualisations explore the colour and brightness of winning Archibald portraits. We first discuss our methodology of colour extraction and colour averaging before exploring the results.

Colour averaging#

An image can be summarised down into one average color by taking the average color of all pixels in the image.

Each pixel in an image is represented by a combination of three color values: red, green, and blue (RGB).
The range of each value is usually from 0 to 255, representing the intensity of the color.

To compute the average color of an image, the RGB values of each pixel are first extracted and then averaged across all pixels in the image. The resulting values represent the average color of the image.

Example: An image is often represented as a two-dimensional array of numbers, where each number represents the intensity or color of a pixel. The size of the array corresponds to the dimensions of the image, and each element in the array corresponds to a pixel in the image.

For example, consider a simple grayscale image of size 4x4 pixels:

	0	1	2	3
0	10	15	20	25
1	30	35	40	45
2	50	55	60	65
3	70	75	80	85

In this case, the numbers in the array represent the intensity of each pixel, ranging from 0 (black) to 255 (white). For example, the pixel in the top-left corner of the image has an intensity of 10, while the pixel in the bottom-right corner has an intensity of 85.

For a color image, the array would have an extra dimension representing the color channels (usually red, green, and blue), resulting in a three-dimensional array. For example, a color image of size 4x4 pixels might be represented as:

	0	1	2	3
0	[10, 20, 30]	[15, 25, 35]	[20, 30, 40]	[25, 35, 45]
1	[30, 40, 50]	[35, 45, 55]	[40, 50, 60]	[45, 55, 65]
2	[50, 60, 70]	[55, 65, 75]	[60, 70, 80]	[65, 75, 85]
3	[70, 80, 90]	[75, 85, 95]	[80, 90, 100]	[85, 95, 105]

In this case, each pixel is represented by a three-element array, with each element representing the intensity of the red, green, and blue channels, respectively. For example, the pixel in the top-left corner of the image has a red intensity of 10, a green intensity of 20, and a blue intensity of 30.

Colour over time#

We first consider the average colour of winning portraits by decade.

_images/025f3881001c4d15658989a513615cb4ae0ae7eaa0592c48f85b45d271a0ed09.png

_images/ce4c8d0ba1aabda7b667e5f58d0a6e87fb64330d467e6a43118f11f9b87c1479.png

_images/c8d8a04fa522bf8949d08cc2bd02f3a62007a035a117451c0e30aa02b458c985.png

_images/8aae64b0df19689e6919e85dd5187e4b0efe09bc9956cee14cd5c3fbacf5120d.png

_images/2af5362eec7cc80be8bd97136190d144349b5710af4136ec499e10ff4c5f0b19.png

_images/3db9a89994b791fdf00c7cccbf6e3ca2aff7940952fbd809065580c4dbd954a9.png

_images/3e96e04baad42d723461497a92c4c0ce10d957702b5b5c7745a0748652364db8.png

_images/43967ba7fa1c28ea0a7b4da7e77a1af816af8df7a63335e6b336fae66fce68c9.png

_images/c0751b0b7ee72c45aa216b762558b06c8a3f6a98033b0efb78b3557da7e0480c.png

_images/10282d70d726f685fb2117e2aed2bdb5b101972e07ad17f8ac76a279b0ea32aa.png

_images/9a04c3ace398ee8722d55673479bfe689dbae999d0a378943e6f881e95d79a4c.png

Colour averaging and categorisation#

The below visualisations were constructed using python code which extracts and categorises colors from each image of the winning portrait. The code begins by importing various libraries, including PIL, OpenCV, and matplotlib, which are used for image manipulation and visualization, as well as external packages like extcolors and colormath for colour extraction and categorisation.

Step 1
The get_closest_color() function is used to determine the closest named colour to a given RGB value using the CIE2000 algorithm, which measures the difference between two colours in terms of their perceived similarity. The colour names and corresponding RGB values are stored in a color_map dictionary, which is created using the webcolors library.

The CIE2000 algorithm is named after the International Commission on Illumination (CIE), an organisation that sets standards for colour measurement and specification. The algorithm takes into account how colours are perceived by the human eye, including factors like brightness, saturation, and hue. It was developed to improve upon earlier colour difference formulas, which were found to be inadequate for accurately describing how colours appear to the human eye.

Step 2
Next, the code reads in a list of image file names from a directory and uses the extcolors library to extract the dominant colours from each image. The extracted colours are then processed using the color_to_df() function to create a dataframe of colour codes, occurrence frequencies, and proportions.

The code then ranks the colours by their proportion and assigns a colour name using the get_closest_color() function and the color_map dictionary. The color_map dictionary is essentially a collection of named colours and their corresponding RGB values. In this code, the color_map dictionary is created using the webcolors library, which provides a list of 147 named colours and their corresponding RGB values in a convenient format. The reason the color_map dictionary is useful is that it allows the code to assign a human-readable name to each colour it extracts from an image. For example, if the code extracts a colour that is very close to (255, 0, 0), it can look up that RGB value in the color_map dictionary and see that it corresponds to the colour name “red.” This makes it easier for humans to interpret and analyze the resulting data.

Step 3
Finally, the resulting colour data is concatenated into a single dataframe for further analysis.

Note we commented out the code for the colour extraction and categorisation process as it takes 3 minutes to run. Instead, we saved the resulting dataframe as a csv file and imported it into the notebook from the ACDE Github repository.

Show code cell source Hide code cell source

# # Takes 3 minutes to run

# import matplotlib.patches as patches
# import matplotlib.image as mpimg

# from PIL import Image
# from matplotlib.offsetbox import OffsetImage, AnnotationBbox

# # !pip install easydev                 #version 0.12.0
# # !pip install colormap                #version 1.0.4
# # !pip install opencv-python           #version 4.5.5.64
# # !pip install colorgram.py            #version 1.2.0
# # !pip install extcolors               #version 1.0.0
# # !pip install colormath               #version 3.0.0
# # !pip install webcolors               #version 1.11.1

# import cv2
# import extcolors

# from colormap import rgb2hex
# from colormath.color_objects import sRGBColor, LabColor
# from colormath.color_conversions import convert_color
# from colormath.color_diff import delta_e_cie2000
# import webcolors

# def get_closest_color(requested_color, color_map):
#     requested_color = sRGBColor(*requested_color)
#     requested_color = convert_color(requested_color, LabColor)

#     min_distance = float("inf")
#     closest_color = None
#     for color_name, color_rgb in color_map.items():
#         color = sRGBColor(*color_rgb)
#         color = convert_color(color, LabColor)
#         distance = delta_e_cie2000(requested_color, color)
#         if distance < min_distance:
#             min_distance = distance
#             closest_color = color_name

#     return closest_color

# color_map = {color_name: webcolors.name_to_rgb(color_name) for color_name in webcolors.CSS3_NAMES_TO_HEX.keys()}

# from os import listdir
# from os.path import isfile, join
# onlyfiles = [f for f in listdir('./images/ArchibaldWinners') if isfile(join('./images/ArchibaldWinners', f))]

# def color_to_df(input):
#     colors_pre_list = str(input).replace('([(','').split(', (')[0:-1]
#     df_rgb = [i.split('), ')[0] + ')' for i in colors_pre_list]
#     df_percent = [i.split('), ')[1].replace(')','') for i in colors_pre_list]
    
#     #convert RGB to HEX code
#     df_color_up = [rgb2hex(int(i.split(", ")[0].replace("(","")),
#                           int(i.split(", ")[1]),
#                           int(i.split(", ")[2].replace(")",""))) for i in df_rgb]
    
#     df = pd.DataFrame(zip(df_color_up, df_percent), columns = ['c_code','occurence'])
#     return df

# df_colors = pd.DataFrame(columns = ['c_code','occurence'])
# onlyfiles.sort()

# for f in onlyfiles:
#     colors_x = extcolors.extract_from_path('./images/ArchibaldWinners/' + f, 
#                                            tolerance = 12, limit = 25)
#     df_color = color_to_df(colors_x)
#     df_color['proportion'] = df_color['occurence'].astype(float) / df_color['occurence'].astype(float).sum()
#     df_color['rank'] = df_color['proportion'].rank(ascending=False)
#     df_color['color_name'] = df_color.c_code.\
#         apply(lambda x: get_closest_color(webcolors.hex_to_rgb(x), color_map))
#     df_color['year'] = f[:4]
#     # df_colors = df_colors.append(df_color, ignore_index=True)
#     df_colors = pd.concat([df_colors, df_color], ignore_index=True)

# df_colors.to_csv('data/local/archies_colors.csv', index=False)          

# From GitHub, fetch the pre-computed colour data for every Archibald winning portrait
# (this replaces running the commented-out extraction code, which writes archies_colors.csv)
df_colors = fetch_small_data_from_github("archies_colors.csv")

# display the data as an interactive, scrollable table with per-column filters in the footer
show(df_colors, scrollY="400px", scrollCollapse=True, scrollX=True,
     paging=False, showIndex=False, column_filters="footer", dom="tpr")

c_code	occurence	proportion	rank	color_name	year
Loading... (need help?)
c_code	occurence	proportion	rank	color_name	year

Show code cell source Hide code cell source

# Bucket each year into a 20-year period (vicennium), labelled by the period's first year
df_colors['year_vicennium'] = df_colors['year'].astype(int).apply(lambda x: x - x % 20)
# Fold the 2020 bucket into the 2000 period, so the last period is the open-ended "2000-" one
df_colors['year_vicennium'] = np.where(df_colors['year_vicennium'] == 2020, 2000, df_colors['year_vicennium'])

# Divide each colour's within-portrait proportion by the number of distinct years
# in its period, so that `proportion2` summed over a period is a per-year average.
# A grouped transform handles every period present in the data, instead of
# hard-coding one np.where per period.
years_in_period = df_colors.groupby('year_vicennium')['year'].transform('nunique')
df_colors['proportion2'] = df_colors['proportion'] / years_in_period


def _period_colour_totals(frame):
    """Sum `proportion2` per (period, colour), largest colours first within each period."""
    return frame\
        .groupby(['year_vicennium','color_name'])\
        .agg({'proportion2':'sum'}).reset_index()\
        .sort_values(['year_vicennium','proportion2'], ascending=[True,False])


def _hex_channel(color_name, start, stop):
    """Return one channel of a CSS3 colour name as an int, sliced from its '#rrggbb' hex code."""
    return int(CSS3_NAMES_TO_HEX[color_name][start:stop], 16)


# One stacked-area chart per period: the five dominant colours of period `y`,
# traced across all periods.
for y in df_colors['year_vicennium'].unique():
    # the five colours with the largest total proportion within period y
    top5cols = _period_colour_totals(df_colors[df_colors.year_vicennium == y])\
        .groupby('year_vicennium')\
        .head(5)['color_name'].unique()

    # totals of those five colours in every period; .copy() so that the
    # RGB columns added below are written to an independent frame
    df_colors_top5 = _period_colour_totals(df_colors[df_colors.color_name.isin(top5cols)])\
        .groupby('year_vicennium')\
        .head(5)\
        .copy()

    # periods as rows, colours as columns; a colour absent from a period counts as 0
    df_colors_top5_pivot = df_colors_top5\
        .pivot(index='year_vicennium', columns='color_name', values='proportion2')\
        .fillna(0)

    # RGB components of each colour, used only to order the stacked layers
    df_colors_top5['red'] = df_colors_top5['color_name']\
        .apply(lambda name: _hex_channel(name, 1, 3))
    df_colors_top5['green'] = df_colors_top5['color_name']\
        .apply(lambda name: _hex_channel(name, 3, 5))
    df_colors_top5['blue'] = df_colors_top5['color_name']\
        .apply(lambda name: _hex_channel(name, 5, None))

    # colour names ordered by ascending red, then green, then blue
    ordered_columns = df_colors_top5\
        .sort_values(['red','green','blue'], ascending=[True,True,True])['color_name']\
        .unique()

    # Reorder the columns
    df_colors_top5_pivot = df_colors_top5_pivot.reindex(columns=ordered_columns)

    # plot stacked area chart; each layer is drawn in the colour it is named after
    df_colors_top5_pivot\
        .plot.area(stacked=True, figsize=(10,5),
        color=df_colors_top5_pivot.columns,
        alpha=0.825)

    # label each period with the combined share of the five colours, just above the stack
    for x, row in df_colors_top5_pivot.iterrows():
        total = row.sum()
        plt.text(x, total+0.05,
        f'{total:.0%}',
        ha='center', va='center', fontsize=12)

    plt.legend(loc='upper center', ncol=5, bbox_to_anchor=(0.5, 1.1))

    # Set the y-axis limits
    plt.ylim(0, 0.85)

    # Set the y-axis formatter to show percentages
    plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))

    plt.xlabel("")
    plt.grid(axis='x')
    plt.xticks([1920,1940,1960,1980,2000],
    ['1920-1940', '1940-1960', '1960-1980','1980-2000', '2000-'])

    # the last period is open-ended and currently stops at 2022
    plus20 = 2022 if y == 2000 else y + 20
    plt.title(f'Proportion of top five colors in Archibald winning portraits during {y}-{plus20}, 20-year periods\n\n')

    plt.show()

_images/c0787790001a00dc29424067f41d1d5b25ad5001735ff1c0dc71a81a68152a2a.png

_images/738c1d1e9a77086f4e9c70a8e3f188461ce01cdb130b152a720d45964a806a37.png

_images/de625ba4af714ec9516cd14e6b2cb1181cfce5d04ea4b0ba8fecd0409d809511.png

_images/3cdfdf4c4bf2f5ab182ada462144ce82b2f3c06c2b8c9f5ea2c0e94c39bad5f9.png

_images/d23fa5ac61a10c18790d784aece45c367c36bfb7cc8e05727856d1a55616add4.png

Show code cell source Hide code cell source

def _ranked_colour_totals(frame):
    """Sum `proportion2` per (period, colour), largest colours first within each period."""
    return frame\
        .groupby(['year_vicennium','color_name'])\
        .agg({'proportion2':'sum'}).reset_index()\
        .sort_values(['year_vicennium','proportion2'], ascending=[True,False])


# union of the five largest colours of every 20-year period
top30cols = _ranked_colour_totals(df_colors)\
    .groupby('year_vicennium')\
    .head(5)['color_name'].unique()

# per-period totals for that set of colours
top30cols_df = _ranked_colour_totals(df_colors[df_colors.color_name.isin(top30cols)])\
    .groupby('year_vicennium')\
    .head(100)

# one time series per colour, each drawn in the colour it is named after
fig, ax = plt.subplots(figsize=(8,6))
named_palette = sns.color_palette(top30cols_df.color_name.unique(), len(top30cols))

sns.lineplot(
    data=top30cols_df, x='year_vicennium', y='proportion2', hue='color_name',
    palette=named_palette, ax=ax, alpha=0.6, linewidth=2.5,
)

# markers on the first (1920) and last (2000) period of each line
endpoints = top30cols_df[top30cols_df.year_vicennium.isin([1920, 2000])]
sns.scatterplot(
    data=endpoints, x='year_vicennium', y='proportion2', hue='color_name',
    palette=named_palette, ax=ax, s=50, marker='o', legend=False,
)

# the lines are labelled directly below, so drop the legend
ax.legend().remove()
ax.set_title('Top colours over time')
ax.set_xlabel('')
ax.set_ylabel('')

plt.grid(axis='x', alpha=0.5)
plt.ylim(-.0275, 0.3)
# the blank ticks at 1905 and 2012 leave room for the colour labels on either side
plt.xticks([1905, 1920,1940,1960,1980,2000, 2012],
['','1920-1940', '1940-1960', '1960-1980','1980-2000', '2000-',''])

# hand-placed colour labels: (label text, anchor point in data coordinates);
# the text colour is the label with its line breaks removed
colour_labels = [
    ('navy', (1913, 0.257)),
    ('midnight\nblue', (1913, 0.14)),
    ('maroon', (1913, 0.12)),
    ('saddle\nbrown', (1913, 0.07)),
    ('black', (1913, 0.055)),
    ('olive', (2006, 0.015)),
    ('dark\ngreen', (1913, -0.0275)),
    ('peru', (1913, 0.01)),
    ('tan', (1913, 0.027)),
    ('sienna', (1913, -.0025)),
    ('thistle', (2006.5, 0.0325)),
    ('dark\nslate\ngray', (2006.5, -0.025)),
]
for label, anchor in colour_labels:
    ax.annotate(label, anchor, textcoords="offset points", xytext=(0,10),
                ha='center', size=12, color=label.replace('\n', ''))

# show plot
plt.show()

_images/4fec1a046daa4d32158d57de4853557334a307a3b2da6b1d4c0ff9d8fa9764ef.png

Brightness over time#

We next consider the average brightness of winning portraits over time.

Show code cell source Hide code cell source

# def brightness( im_file ):
#     im = Image.open(im_file)
#     stat = Stat(im)
#     r,g,b = stat.mean
#     return math.sqrt(0.299*(r**2) + 0.587*(g**2) + 0.114*(b**2))

# portraits_path = "./images/images_analysis/ArchibaldWinners"
# onlyfiles = [f for f in listdir(portraits_path) if isfile(join(portraits_path, f))]

# # sort image files in decade dictionary
# images_df = pd.DataFrame(onlyfiles)
# images_df['year'] = images_df[0].apply(lambda x: int(x.split('_')[0]))
# images_df.loc[images_df[0] == '1990_SID78808M.jpg.641x900_q85.jpg','year'] = 1991
# images_df['decade'] = [ int(np.floor(int(year)/10) * 10) 
#                        for year in np.array(images_df["year"])]
# images_df['brightness'] = images_df[0].apply(lambda x: brightness('./images/images_analysis/ArchibaldWinners/' + x))

# # create figure
# fig = plt.figure(figsize=(14, 8))
# ax = plt.axes()

# peaks = images_df[images_df['year'].isin([1921,1930,1936,1942,1947,1956,
#                                           1965, 1966,1978,1989,
#                                           2001,2002,2006,2014,2015,2022])]

# ax.plot(images_df.sort_values('year')['year'],
#        images_df.sort_values('year')['brightness'])

# ax.plot(peaks.sort_values('year')['year'],
#         peaks.sort_values('year')['brightness'], "o", color='#1f77b4')

# # Draw image
# arr_image = plt.imread('./images/images_analysis/ArchibaldWinners/' + \
#                        images_df[images_df.year == 1936].iloc[0][0], format='jpg')
# axin = ax.inset_axes([1926,136,15,55],transform=ax.transData)    # create new inset axes in data coordinates
# axin.imshow(arr_image)
# axin.axis('off')

# arr_image = plt.imread('./images/images_analysis/ArchibaldWinners/' + \
#                        images_df[images_df.year == 1942].iloc[0][0], format='jpg')
# axin = ax.inset_axes([1935,163,15,55],transform=ax.transData)    # create new inset axes in data coordinates
# axin.imshow(arr_image)
# axin.axis('off')

# arr_image = plt.imread('./images/images_analysis/ArchibaldWinners/' + \
#                        images_df[images_df.year == 1956].iloc[0][0], format='jpg')
# axin = ax.inset_axes([1948,165,15,55],transform=ax.transData)    # create new inset axes in data coordinates
# axin.imshow(arr_image)
# axin.axis('off')

# arr_image = plt.imread('./images/images_analysis/ArchibaldWinners/'+ \
#                        images_df[images_df.year == 1966].iloc[0][0], format='jpg')
# axin = ax.inset_axes([1961,165,15,55],transform=ax.transData)    # create new inset axes in data coordinates
# axin.imshow(arr_image)
# axin.axis('off')

# arr_image = plt.imread('./images/images_analysis/ArchibaldWinners/' + \
#                        images_df[images_df.year == 1989].iloc[0][0], format='jpg')
# axin = ax.inset_axes([1982,199,15,55],transform=ax.transData)    # create new inset axes in data coordinates
# axin.imshow(arr_image)
# axin.axis('off')

# arr_image = plt.imread('./images/images_analysis/ArchibaldWinners/' + \
#                        images_df[images_df.year == 1978].iloc[0][0], format='jpg')
# axin = ax.inset_axes([1970,195,15,55],transform=ax.transData)    # create new inset axes in data coordinates
# axin.imshow(arr_image)
# axin.axis('off')

# arr_image = plt.imread('./images/images_analysis/ArchibaldWinners/' + \
#                        images_df[images_df.year == 2002].iloc[0][0], format='jpg')
# axin = ax.inset_axes([1994,210,15,55],transform=ax.transData)    # create new inset axes in data coordinates
# axin.imshow(arr_image)
# axin.axis('off')

# arr_image = plt.imread('./images/images_analysis/ArchibaldWinners/' + \
#                        images_df[images_df.year == 2014].iloc[0][0], format='jpg')
# axin = ax.inset_axes([2007,205,15,55],transform=ax.transData)    # create new inset axes in data coordinates
# axin.imshow(arr_image)
# axin.axis('off')

# arr_image = plt.imread('./images/images_analysis/ArchibaldWinners/' + \
#                        images_df[images_df.year == 1921].iloc[0][0], format='jpg')
# axin = ax.inset_axes([1906,-30,15,55],transform=ax.transData)    # create new inset axes in data coordinates
# axin.imshow(arr_image)
# axin.axis('off')

# arr_image = plt.imread('./images/images_analysis/ArchibaldWinners/' + \
#                        images_df[images_df.year == 2022].iloc[0][0], format='jpg')
# axin = ax.inset_axes([2021,40,15,65],transform=ax.transData)    # create new inset axes in data coordinates
# axin.imshow(arr_image)
# axin.axis('off')


# arr_image = plt.imread('./images/images_analysis/ArchibaldWinners/' + \
#                        images_df[images_df.year == 1930].iloc[0][0], format='jpg')
# axin = ax.inset_axes([1922,-46.5,15,55],transform=ax.transData)    # create new inset axes in data coordinates
# axin.imshow(arr_image)
# axin.axis('off')

# arr_image = plt.imread('./images/images_analysis/ArchibaldWinners/' + \
#                        images_df[images_df.year == 1947].iloc[0][0], format='jpg')
# axin = ax.inset_axes([1940,-35,15,55],transform=ax.transData)    # create new inset axes in data coordinates
# axin.imshow(arr_image)
# axin.axis('off')

# arr_image = plt.imread('./images/images_analysis/ArchibaldWinners/' + \
#                        images_df[images_df.year == 1965].iloc[0][0], format='jpg')
# axin = ax.inset_axes([1958,-30,15,55],transform=ax.transData)    # create new inset axes in data coordinates
# axin.imshow(arr_image)
# axin.axis('off')

# arr_image = plt.imread('./images/images_analysis/ArchibaldWinners/' + \
#                        images_df[images_df.year == 2001].iloc[0][0], format='jpg')
# axin = ax.inset_axes([1991,-20,15,55],transform=ax.transData)    # create new inset axes in data coordinates
# axin.imshow(arr_image)
# axin.axis('off')

# arr_image = plt.imread('./images/images_analysis/ArchibaldWinners/' + \
#                        images_df[images_df.year == 2006].iloc[0][0], format='jpg')
# axin = ax.inset_axes([1999.5,-35,15,55],transform=ax.transData)    # create new inset axes in data coordinates
# axin.imshow(arr_image)
# axin.axis('off')

# arr_image = plt.imread('./images/images_analysis/ArchibaldWinners/' + \
#                        images_df[images_df.year == 2015].iloc[0][0], format='jpg')
# axin = ax.inset_axes([2009.5,-35,15,55],transform=ax.transData)    # create new inset axes in data coordinates
# axin.imshow(arr_image)
# axin.axis('off')

# for tick in ax.xaxis.get_major_ticks(): tick.label1.set_fontsize(14)
# for tick in ax.yaxis.get_major_ticks(): tick.label1.set_fontsize(14)

# plt.title('Brightness over time, Archibald Winners', size=18)
# ax.set_ylim([-49.5, 280])
# ax.set_xlim([1905, 2035])
# plt.show()

# Display the pre-rendered brightness figure instead of re-running the
# commented-out image analysis above.
# NOTE(review): this rebinds the name `Image`; the commented-out brightness()
# calls Image.open, so confirm which Image it expects before re-enabling it.
from IPython.display import Image
Image(filename='images/images_analysis/Brightness_python.png')

_images/c7345ddb6125d0c29ae0c506fdadc84e4f3b63c0d1009bff4d15b4315dbd2826.png

Participation#

We explore participation rates over time, comparing the number of entries and the number of finalists. We also investigate the relationship between the number of entries and change in prize money.

Participation over time#

As shown in the visual below, the finalist conditions of the Archibald Prize changed in 1947. For reference, the median number of finalists post-1947 is 31.5, while the median number of finalists pre-1947 is 86.

_images/296663d76efa8aaaf3d72ca06c797ee4d0a21d73578167d6bb319bc85177c06e.png

Prize Money#

It should be noted that we imported the prize money data from the ACDE Github repository. The data was originally sourced from financial reports of the Art Gallery of NSW. Temporal information of sponsors can be found across the AGNSW website for each respective Archibald prize.

_images/d8a012370873c7e7688d955dde355b2166511cd649bcc68a9d8624d47cf41992.png

Prize money before sponsors#

The visuals below show the prize money before sponsors are taken into account. The prize money was originally in pounds, but we have converted it to Australian dollars for ease of interpretation.

We compute the correlation between the number of entries and the change in prize money. We find that the correlation is 0.51, which is a moderate positive correlation. This suggests that the number of entries is positively associated with the change in prize money.

_images/80b0fde2c2fee29029e43d81f94a51ae6d92b9c55446c27919f978d4717fe186.png

_images/7b93ea9033ee1163cffd5b8571a42a0cbf2280765b9b81cc9603f4c6af722482.png

Prize money with sponsors (post-1980)#

The visuals below show the prize money after sponsors are taken into account. There have been six sponsors/partners since 1980.

We compute the correlation between the number of entries and the change in prize money during this period. We find that the correlation is 0.84, which is a strong positive correlation. In contrast to the pre-sponsor period, the correlation is much stronger, which suggests that the number of entries is more strongly associated with the change in prize money during this period.

_images/b5b82d533b6bd9312460e805251f88fee7560779d8b1f1b8beb252bce410050f.png

Show code cell source Hide code cell source

plt.figure(figsize=(16, 8))

# Restrict both series to the sponsor/partner era (1981 onwards)
sponsor_era_prizes = prize_money[prize_money.index >= 1981]
sponsor_era_entries = no_participants[no_participants.Year >= 1981]

# x-axes: the year of each record
x = sponsor_era_prizes.index
x2 = sponsor_era_entries['Year']

# y-axes: prize money (AUD) and number of entries
y = sponsor_era_prizes['AUD_Equivalent']
y2 = sponsor_era_entries['Entries']

# Background shading periods as (start, end, colour), shared by both panels
# so the two cannot drift apart.
sponsor_periods = [
    (1980, 1986, 'orange'),
    (1986, 1988, 'yellow'),
    (1988, 1992, 'green'),
    (1992, 2006, 'red'),
    (2006, 2009, 'green'),
    (2009, 2023, 'blue'),
]


def _shade_sponsor_periods(axis):
    """Shade every period in `sponsor_periods` as a faint vertical band on `axis`."""
    for start, end, colour in sponsor_periods:
        axis.axvspan(start, end, alpha=0.05, color=colour)


# Correlation between entries and prize money over the sponsor era.
# NOTE: despite its name (kept unchanged in case other cells read it),
# `beforesponsors` holds the 1981-onwards rows here.
# NOTE(review): pd.merge joins on the columns the two frames share -
# presumably the year; confirm against the prize_money index name.
beforesponsors = pd.merge(sponsor_era_entries, sponsor_era_prizes.reset_index())
cor = beforesponsors['Entries'].corr(beforesponsors['AUD_Equivalent']).round(2)

# --- top panel: prize money ---
ax = plt.subplot(2, 1, 1)
ax.plot(x, y, lw= 2, label='Prize Money')
_shade_sponsor_periods(ax)

plt.ylim(0,110000)
plt.xlim(1980.1,2023)

plt.xlabel('')
plt.title(f'Archibald prize money and number of entries by year,\nSponsors/Partners era (1981-), Corr: {cor}  ', size=22)

# Format the y-axis labels as a monetary amount
plt.gca().yaxis.set_major_formatter(StrMethodFormatter('${x:,.0f} AUD'))

plt.yticks(size=14)
plt.xticks(size=14)
plt.grid(axis='x')

plt.legend(title='', loc='upper left', fontsize=14)

# --- bottom panel: number of entries ---
ax2 = plt.subplot(2, 1, 2)
ax2.plot(x2, y2, color = 'tab:orange', lw= 2, label='Number of entries')
_shade_sponsor_periods(ax2)

plt.ylim(0,1190)
plt.xlim(1980.1,2023)

plt.yticks(size=14)
plt.xticks(size=14)
plt.grid(axis='x')

plt.legend(title='', loc='upper left', fontsize=14)

# save figure
# plt.savefig('correlation.png', dpi=330, bbox_inches='tight')

# Show the plot
plt.show()

_images/3fabf78e9d5244ff78a2ac7482ad56aeca7b18b79b0190e208b2caa7a3fc2048.png

Participant characteristics#

Below we output some summary statistics of the Archibald participant data. The main findings are listed below:

There are 62 distinct winners of the Archibald prize.
The median winning painter begins participating in the Archibald Prize at the age of 35 and stops participating at the age of 61 (the corresponding means are 38 and 59 years).
While most winning painters participate in the Archibald Prize for roughly 10 to 30 years, there are some outliers who participate for much longer. For example, Joshua Smith participated over a span of 62 years, with his first entry at the age of 19 and last entry at the age of 81.

	count	mean	std	min	25%	50%	75%	max
Age at first participation	62.0	38.032258	10.850652	19.0	31.0	35.0	45.0	64.0

	count	mean	std	min	25%	50%	75%	max
Age at last participation	62.0	59.032258	13.047163	33.0	48.0	61.0	68.75	81.0

	count	mean	std	min	25%	50%	75%	max
Diff. between first and last	62.0	21.0	14.990708	0.0	9.5	16.5	31.75	62.0

Archibald Prize participation trajectory#

We visualise the participation trajectory of winners by illustrating a painter’s association with the Archibald over time, where time is reflected by their age. This allows us to compare the participation trajectory of winners normalised by age.

We order the winners by their age at their first win and highlight this first win by a dashed red line. In terms of the y-axis, Participation represents an Archibald participation (with no win) and Win represents an Archibald win. In other words, high peaks represent Archibald wins and low peaks represent Archibald participations (with no win). This representation allows us to see how often each artist participated in the Archibald prize. In most cases, this occurs in clusters.

_images/ed35e9f627187407b176ca569c70ab2ff460c1d9426c1072112ae2dc89d1aaf9.png

We also considered using a Sankey diagram to visualise trajectories; however, more effort is needed to make the visual easier to interpret.

Who is in the portrait?#

We investigate sitter characteristics of winning portraits, such as their occupation and/or relationship with the painter. We use the ANZSCO classification system to categorise the occupation of the sitter. The ANZSCO classification system is a skill-based classification system used to classify all occupations and jobs in the Australian and New Zealand labour markets.

Below we see the proportions of winning portraits by sitter occupation. The most common occupation is Arts and Media Professionals (58%).

Arts and Media Professionals                                0.578431
Chief Executives, General Managers and Legislators          0.127451
Uncategorised                                               0.107843
Design, Engineering, Science and Transport Professionals    0.039216
Education Professionals                                     0.029412
Health Professionals                                        0.029412
Legal, Social and Welfare Professionals                     0.029412
Protective Service Workers                                  0.019608
Sports and Personal Service Workers                         0.019608
Hospitality, Retail and Service Managers                    0.019608
Name: ANZSCO_1, dtype: float64

_images/ff187f98c0f7b09ec6abb83686a166ba1a85087ee207b371588bea6dce629920.png

Show code cell source Hide code cell source

# Decade of each prize year, e.g. 1947 -> 1940
archies['Decade'] = archies['YEAR'].astype(int) // 10 * 10

# Label sitters without an ANZSCO group as 'Family/Friend'. Assign the result
# back rather than calling fillna(inplace=True) on the column selection: the
# chained in-place form is deprecated in pandas 2.x and does not update the
# frame under Copy-on-Write.
archies['ANZSCO_1'] = archies['ANZSCO_1'].fillna('Family/Friend')

# Keep three named groups and pool every other value into 'Other'
named_groups = ['Arts and Media Professionals',
                'Chief Executives, General Managers and Legislators',
                'Uncategorised']
archies['ANZSCO_1_v2'] = np.where(archies['ANZSCO_1'].isin(named_groups), archies['ANZSCO_1'], 'Other')

# number of winning portraits per decade (rows) and occupation group (columns)
t1 = pd.crosstab(archies['Decade'],archies['ANZSCO_1_v2'])

# translucent lines first, then solid markers on the same axes
# NOTE: the four marker colours assume t1 has exactly four columns
ax = t1.plot(linewidth=2, alpha=0.6)
t1.plot(marker="o", markersize=6, alpha=0.9, ax=ax, linewidth=0, color=['#1f77b4','#ff7f0e','#2ca02c','#d62728'], legend=None)

# both plot calls register handles; keep only the first four (the lines) in the legend
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles=handles[:4], labels=labels[:4], ncol=2, bbox_to_anchor=(0.5, 1.175),
          fontsize=10.5, loc='upper center', frameon=False)

# add grid lines on the x-axis
plt.grid(axis='x', alpha=0.5)

# add faint vertical lines every 20 years (1930, 1950, ..., 2010)
for decade in range(1930, 2011, 20):
    plt.axvline(decade, color='grey', linestyle='-', lw=1, alpha=0.2)

plt.title('Occupation of subjects by decade\n', size=14, pad=35)

# increase axis tick label sizes
plt.yticks(size=12)
plt.xticks(size=11)

# increase x-axis title size
plt.xlabel('Decade', size=12)

# self-portrait counts as (decade, count, label height); values are hand-entered
self_portraits = [
    (1930, 2, 5.5),
    (1950, 1, 5.5),
    (1970, 2, 4.75),
    (1980, 1, 6.5),
    (1990, 3, 8.5),
    (2000, 2, 10.5),
    (2010, 2, 8.5),
]
for decade, count, height in self_portraits:
    plt.annotate(f'({count})', xy=(decade, 1), xytext=(decade, height), size=12, color='tab:blue', ha='center')

# add box explaining the bracketed numbers
plt.annotate('Self portrait occurrences are provided\nin brackets for relevant decades.', xy=(1918, 1), xytext=(1918, 10.25),
             size=12, color='grey', ha='left', alpha=0.7, bbox=dict(boxstyle='round', fc='white', ec='white', alpha=0.8))


# extend the y-axis to leave room for the annotations
plt.ylim(-1, 11.75)

# plt.savefig('subject_occupation_by_decade.png', dpi=330, bbox_inches='tight')

plt.show()

_images/fbd335233d1af651581cbe605083883c28a41a26372eb1e9cbcc9a654e3cd242.png

Online presence of recent winners#

Lastly, we investigate the online presence of recent winners through quantified measures extracted from Google Trends data. Numbers on the y-axis represent search interest relative to the highest point on the chart for the given region and time. A value of 100 is the peak popularity for the term. A value of 50 means that the term is half as popular. A score of 0 means that there was not enough data for this term.

We focus on the five most recent winners.

2018: Yvette Coppersmith
2019: Tony Costa
2020: Vincent Namatjira
2021: Peter Wegner
2022: Blak Douglas

Show code cell source Hide code cell source

# read data from github
googletrends = fetch_small_data_from_github('archies_Last5winners.csv')

sns.set(style='white', context='paper', rc={'figure.figsize':(12, 5)})

# plot a time series with Interest as the y-axis and x-axis in years
ax = googletrends.plot(x='Month', y='Tony Costa', color='tab:orange', linewidth=2, zorder=1, alpha=0.7)
googletrends.plot(x='Month', y='Blak Douglas', color='tab:blue', linewidth=2, zorder=1, ax=ax, alpha=0.7)
googletrends.plot(x='Month', y='Vincent Namatjira', color='tab:red', linewidth=2, zorder=1, ax=ax, alpha=0.7)
googletrends.plot(x='Month', y='Peter Wegner', color='tab:green', linewidth=2, zorder=1, ax=ax, alpha=0.7)
googletrends.plot(x='Month', y='Yvette Coppersmith', color='tab:purple', linewidth=2, zorder=1, ax=ax, alpha=0.7)

# add point hollow
ax.scatter(googletrends[googletrends['Month'] == '2018-05'].index.values[0],
           googletrends[googletrends['Month'] == '2018-05']['Yvette Coppersmith'].values[0]-.5,
           s=20, color='tab:purple', zorder=2)

ax.scatter(googletrends[googletrends['Month'] == '2019-05'].index.values[0],
           googletrends[googletrends['Month'] == '2019-05']['Tony Costa'].values[0]-.5,
           s=20, color='tab:orange', zorder=2)

ax.scatter(googletrends[googletrends['Month'] == '2020-09'].index.values[0]-.0345,
           googletrends[googletrends['Month'] == '2020-09']['Vincent Namatjira'].values[0]-.5,
           s=20, color='tab:red', zorder=2)

ax.scatter(googletrends[googletrends['Month'] == '2021-06'].index.values[0],
           googletrends[googletrends['Month'] == '2021-06']['Peter Wegner'].values[0]-.5,
           s=20, color='tab:green', zorder=2)

ax.scatter(googletrends[googletrends['Month'] == '2022-05'].index.values[0]-.025,
           googletrends[googletrends['Month'] == '2022-05']['Blak Douglas'].values[0]-.5,
           s=20, color='tab:blue', zorder=2)

# add source annotation in bottom right corner
plt.annotate('Source: Google Trends', xy=(0.0125, .925), xycoords='axes fraction', fontsize=12, color='#555555', zorder=2)

# add source annotation in bottom right corner
plt.annotate("May\n'18", xy=(googletrends[googletrends['Month'] == '2018-05'].index.values[0]/googletrends.shape[0], 0.91), 
             xycoords='axes fraction', fontsize=10, color='#555555', ha='center')

# add source annotation in bottom right corner
plt.annotate("May\n'19", xy=((googletrends[googletrends['Month'] == '2019-05'].index.values[0]-0.5)/googletrends.shape[0], 0.91), 
             xycoords='axes fraction', fontsize=10, color='#555555', ha='center')

# add source annotation in bottom right corner
plt.annotate("Sept\n'20", xy=((googletrends[googletrends['Month'] == '2020-09'].index.values[0]-2)/googletrends.shape[0], 0.91), 
             xycoords='axes fraction', fontsize=10, color='#555555', ha='center')

# add source annotation in bottom right corner
plt.annotate("June\n'21", xy=((googletrends[googletrends['Month'] == '2021-06'].index.values[0]-3)/googletrends.shape[0], 0.91), 
             xycoords='axes fraction', fontsize=10, color='#555555', ha='center')

# add source annotation in bottom right corner
plt.annotate("May\n'22", xy=((googletrends[googletrends['Month'] == '2022-05'].index.values[0]-3.75)/googletrends.shape[0], 0.91), 
             xycoords='axes fraction', fontsize=10, color='#555555', ha='center')

# lightly shade one window of months per year (2018-2022), each in its own colour;
# spans are given as (first month, last month, colour) and are drawn in data
# coordinates using the row index of each month in `googletrends`
_shaded_windows = [
    ('2018-03', '2018-07', 'tab:purple'),
    ('2019-03', '2019-07', 'tab:orange'),
    ('2020-07', '2020-11', 'tab:red'),
    ('2021-04', '2021-08', 'tab:green'),
    ('2022-03', '2022-07', 'tab:blue'),
]
for _first_month, _last_month, _shade in _shaded_windows:
    _left = googletrends[googletrends['Month'] == _first_month].index.values[0]
    _right = googletrends[googletrends['Month'] == _last_month].index.values[0]
    plt.axvspan(_left, _right, color=_shade, alpha=0.025, zorder=3)

# --- axis cosmetics ---
plt.xlabel('')  # no x-axis label

# faint horizontal guide lines, kept beneath the plotted artists
_current_axes = plt.gca()
_current_axes.grid(axis='y', alpha=0.5)
_current_axes.set_axisbelow(True)

# y ticks at 0, 25, 50, 75, 100; the upper limit of 120 leaves room above the 100 tick
plt.yticks(np.arange(0, 110, 25), fontsize=12)
plt.xticks(fontsize=12)
plt.ylim(0, 120)

# --- title and legend ---
# the two trailing newlines lift the title, presumably to make room for the
# single-row legend anchored just above the axes
plt.title('Google search term usage for the last 5 winners, 2013-2023\n\n', fontsize=16)
plt.legend(loc='upper center', fontsize=11.5, bbox_to_anchor=(0.5, 1.125), ncol=5)

# to export the figure, uncomment:
# plt.savefig('google_trends_last_five_winners.png', dpi=330, bbox_inches='tight')

plt.show()

# How to read the y-axis (Google Trends' own description):
# numbers represent search interest relative to the highest point on the chart
# for the given region and time. A value of 100 is the peak popularity for the
# term, a value of 50 means that the term is half as popular, and a score of 0
# means that there was not enough data for this term.

_images/bde9d1c10c7c2a4148d6ba68c3f927d95619fd64bf4a17820e2ba32cb130ee07.png

Exploring success quantitatively using Archibald Prize data

Contents

Exploring success quantitatively using Archibald Prize data#

Import packages and pre-process data#

Gender distribution#

Male and female distribution for Archibald winners#

Male and female distribution of sitters for winning Archibald portraits#

Do males paint males?#

Male and female distribution over time#

Winning age for Archibald winners#

Winning age by year#

Winning age for Archibald winners (cont.)#

Winning age for Archibald winners by vicennium#

Colour and Brightness#

Colour averaging#

Colour over time#

Colour averaging and categorisation#

Brightness over time#

Participation#

Participation over time#

Prize Money#

Prize money before sponsors#

Prize money with sponsors (post-1980)#

Participant characteristics#

Archibald Prize participation trajectory#

Who is in the portrait?#

Online presence of recent winners#