library(jsonlite)
library(tidyverse)
# Saved this xlsx from google sheets -- get ref info on geo and theme IDs
# Theme lookup table (id -> theme label), read from the 'theme' sheet.
# The `name` column is split on double quotes to isolate the quoted id, then
# the remainder is split on ">" / "<" to isolate the label.
# NOTE(review): presumably each cell holds an HTML-like tag such as
# <option value="ID">Label</option> -- confirm against the sheet.
ref_theme <- readxl::read_xlsx('manneken-pis-costumes-ref.xlsx', sheet = 'theme') %>%
  separate(name, into = c('trash', 'id', 'wip'), sep = "\"") %>%
  separate(wip, into = c('trash2', 'theme', 'trash3', 'trash4'), sep = ">|<") %>%
  # the trash* columns only hold the pieces surrounding the id and the label
  select(-starts_with('trash'))
# Geography lookup table (id -> geo label), read from the 'geography' sheet.
# The `name` column is split on double quotes to isolate the quoted id, then
# the remainder is split on ">" / "<" to isolate the label.
# NOTE(review): presumably each cell holds an HTML-like tag such as
# <option value="ID">Label</option> -- confirm against the sheet.
ref_geo <- readxl::read_xlsx('manneken-pis-costumes-ref.xlsx', sheet = 'geography') %>%
  separate(name, into = c('trash', 'id', 'wip'), sep = "\"") %>%
  separate(wip, into = c('trash2', 'geo', 'trash3', 'trash4'), sep = ">|<") %>%
  # the trash* columns only hold the pieces surrounding the id and the label
  select(-starts_with('trash'))
# Load JSONs from file, parse + merge into one df, with ref info attached
# Both objects are iterated element-by-element further down, with each
# element treated as a data frame of costumes.
# NOTE(review): these are absolute, machine-specific paths -- adjust them
# (or switch to project-relative paths) before running elsewhere.
costume_theme <- fromJSON("/Users/carobuck/Documents/fun_projects/manneken-pis/theme_to_costumes.json")
costume_location <- fromJSON('/Users/carobuck/Documents/fun_projects/manneken-pis/location_to_costumes.json')
# deal w/ themes
# 1. Drop the `t_s` column from every per-theme data frame.
#    drop = FALSE keeps the result a data frame even if only one column
#    remains, so the mutate() below always receives a data frame.
list_of_dfs_no_extra <- lapply(costume_theme, function(df) {
  df[, !names(df) %in% c("t_s"), drop = FALSE]
})
# 2. Tag each data frame with its list name (position, if unnamed) as id_theme.
list_of_dfs_with_id <- imap(list_of_dfs_no_extra, ~ mutate(.x, id_theme = .y))
# 3. Stack all themes into a single data frame.
combined_df_costume <- bind_rows(list_of_dfs_with_id)
# deal w/ geo
# 1. For every per-location data frame, coerce `body` to character and drop
#    the `t_s` column. The pipeline's value is the lambda's return value.
list_of_dfs_no_extra <- lapply(costume_location, function(df) {
  df %>%
    mutate(body = as.character(body)) %>%
    select(-t_s)
})
# 2. Tag each data frame with its list name (position, if unnamed) as id_geo.
list_of_dfs_with_id <- imap(list_of_dfs_no_extra, ~ mutate(.x, id_geo = .y))
# 3. Stack all locations into a single data frame.
combined_df_geo <- bind_rows(list_of_dfs_with_id)
# combine + clean up df
# The first full_join() has no `by`, so dplyr joins on every column the two
# frames have in common; the reference tables are then attached by id.
# Full joins keep rows that have no match on either side (filled with NA).
clean_data <- combined_df_costume %>%
  full_join(combined_df_geo) %>%
  full_join(ref_geo, by = join_by(id_geo == id)) %>%
  full_join(ref_theme, by = join_by(id_theme == id)) %>%
  # trim surrounding whitespace from the title and description text
  mutate(t = str_trim(t),
         body = str_trim(body)) %>%
  # remove exact duplicate rows
  distinct()
Manneken Pis Data Wrangling
Who is Manneken Pis? He’s got a lot of costumes, and I couldn’t find a dataset anywhere that listed all the costumes. I felt that one needed to exist.
Trying to get the data from the website, a premade database, or scraping was a nightmare (Selenium, Docker, UGH). So I went hunting through the HTML manually and found some long lists with all the costume info (both the full descriptions of the costumes and the reference values for theme and geo location). I parse and assemble them together here.
Link to google sheet with ref info (downloaded as xlsx) here
All costumes found/scraped from this site
Get a glimpse of what the data look like now:
# Print a compact column-by-column preview of the combined data
clean_data %>% glimpse()
Rows: 1,480
Columns: 7
$ id <chr> "3616", "3388", "3134", "3036", "3666", "3232", "3722", "2403…
$ t <chr> "\"Grand conseiller de la Confrérie Saint-Etienne\"", "Armail…
$ body <chr> "-1", "<p>Costume presented on the occasion of a chapter held…
$ id_theme <chr> "3304", "3304", "3304", "3304", "3304", "3304", "3304", "3304…
$ id_geo <chr> "3336", "3347", "3587", "3455", "3336", "3599", "3442", "3335…
$ geo <chr> "France", "Switzerland", "Pauillac", "Schaerbeek", "France", …
$ theme <chr> "Gastronomy", "Gastronomy", "Gastronomy", "Gastronomy", "Gast…
Make some quick charts from the data
library(treemapify)
# Bar chart: number of costumes per theme (rows without a theme are dropped)
clean_data %>%
  #count(geo,theme) %>%
  count(theme) %>%
  filter(!is.na(theme)) %>%
  # order the theme axis by count rather than alphabetically
  ggplot(aes(y = n, x = fct_rev(fct_reorder(theme, n)))) +
  geom_col(fill = 'cyan4', color = 'gray30') +
  # print the count next to each bar
  geom_label(aes(label = n), hjust = -0.1) +
  # flip the axes so bars are horizontal
  coord_flip() +
  theme_bw() +
  labs(y = 'Number of Costumes', x = '', title = 'Costumes by Themed Category') +
  theme(axis.text = element_text(size = 11))
# Treemap: one tile per location with at least 10 costumes, sized by count
# and labelled "<count> <location>" (geom_treemap* come from treemapify)
clean_data %>%
  #count(geo,theme) %>%
  count(geo) %>%
  filter(n >= 10) %>%
  ggplot(aes(area = n, fill = geo, label = paste(n, geo))) +
  geom_treemap(color = 'white') +
  geom_treemap_text(colour = "black",
                    place = "centre",
                    size = 15) +
  # tiles are labelled directly, so the fill legend is hidden
  theme(legend.position = 'none') +
  labs(title = 'Locations with at least 10 Outfits')
Get stats for total number of locations and # themes
# Number of rows per theme (NA = rows with no matching theme)
clean_data %>%
  count(theme)
theme n
1 Arts 4
2 Brussels municipalities 7
3 Carnival 37
4 Characters 25
5 Charitable associations 34
6 Cinema 8
7 Education 64
8 Environment 13
9 Fashion and textiles 20
10 Feminism 1
11 Folklore Associations 228
12 Folklore Brussels 19
13 Folklore Students 42
14 Gastronomy 93
15 Health and Medecine 25
16 Historical Costumes 37
17 Human rights 1
18 Humanitarian 24
19 Institutions 11
20 International organisations 2
21 LGBTQIA+ 2
22 Leisure 42
23 Military 65
24 Music 60
25 Older Costumes 6
26 Performance 47
27 Personalities 39
28 Professions 153
29 Radio and television 3
30 Sports 104
31 Traditional Costumes 200
32 Youth associations 19
33 <NA> 45
# Number of rows per location (NA = rows with no matching location)
clean_data %>%
  count(geo)
geo n
1 Aalst 3
2 Africa 4
3 Albania 1
4 Alkmaar 2
5 Amersfoort 3
6 Amsterdam 4
7 Anderlecht 5
8 Antwerp 4
9 Antwerpen 1
10 Appollonia 2
11 Arendonk 1
12 Arezzo 1
13 Argenteuil 2
14 Argentina 3
15 Arles 1
16 Arlon 4
17 Armenia 1
18 Arolsen 1
19 Asia 4
20 Asse 1
21 Ath 1
22 Auch 1
23 Audergem 1
24 Austria 5
25 Azerbaijan 1
26 Basel 2
27 Bassoles-Aulers 2
28 Bastogne 4
29 Beauvechain 1
30 Belgium 345
31 Bergen op Zoom 3
32 Biarritz 2
33 Bilbao 1
34 Binche 5
35 Bolivia 2
36 Bombay 1
37 Bouillon 2
38 Brabant-Wallon 1
39 Braine le Château 2
40 Brasschaat 2
41 Brazil 4
42 Bredene 2
43 Brussels 267
44 Brussels Capital 4
45 Budapest 2
46 Bulgaria 2
47 Burundi 2
48 Canada 8
49 Caughnawaga 1
50 Central Africa 1
51 Central America 1
52 Charleroi 2
53 Chaudfontaine 2
54 Chile 1
55 China 7
56 Chièvres 1
57 Châtelet 1
58 Cilaos 1
59 Ciney 2
60 Ciociari 1
61 Colmar 1
62 Cologne 4
63 Colombia 2
64 Comines-Warneton 1
65 Condom-en-Armagnac 2
66 Congo 7
67 Costa Rica 1
68 Couvin 1
69 Coxyde 2
70 Croatia 1
71 Cuba 2
72 Cyprus 1
73 Czechoslovakia 1
74 Dallas 1
75 Dax 1
76 Denmark 6
77 Digne 1
78 Dinant 6
79 Dominican Republic 1
80 Douai 2
81 Dublin 1
82 Dubrava 1
83 Duisburg 2
84 Düsseldorf 1
85 Ecuador 3
86 Eeklo 1
87 Eigenbrakel 2
88 El Salvador 1
89 Ellezelles 3
90 Estaimpuis 2
91 Estonia 2
92 Etterbeek 1
93 Eupen 2
94 Europe 16
95 Evere 2
96 Finland 4
97 Fleurus 2
98 Florence 1
99 Florenville 1
100 Fontainebleau 2
101 Forchie-la-Marche 2
102 Fosse 5
103 France 71
104 Fécamp 2
105 Gdansk 1
106 Geel 1
107 Gembloux 2
108 Geneva 1
109 Genk 1
110 Georgia 3
111 German Democratic Rep. 1
112 Germany 3
113 Gerpinnes 1
114 Ghent 5
115 Gijon 1
116 Grammont 4
117 Grasse 1
118 Greece 3
119 Groningen 1
120 Gruyère 1
121 Guatamala 1
122 Gussignies 2
123 Haaksbergen 1
124 Hainaut 1
125 Ham-sur-Heure 1
126 Haren 4
127 Hasselt 4
128 Hawaï 1
129 Helsinki 2
130 Henegouwen 1
131 Herquegies 1
132 Herseaux 1
133 Hirson 1
134 Hoeilaart 1
135 Honduras 1
136 Honolulu 2
137 Hungary 2
138 Hélécine 1
139 Ibiza 1
140 India 2
141 Indonesia 2
142 International 26
143 Ireland 1
144 Israel 3
145 Italy 20
146 Ittre 2
147 Jamaica 1
148 Jambes 1
149 Japan 12
150 Jargeau 2
151 Jerez de la Frontera 2
152 Jerusalem 2
153 Jette 1
154 Jodoigne 2
155 Jumet 2
156 Kazakhstan 1
157 Kelmis 1
158 Kenya 2
159 Korea 2
160 Kuwait 1
161 Kyrgyzstan 1
162 L'Hay-les-Roses 2
163 La Flèche 2
164 La Louvière 1
165 La Roche en Ardenne 2
166 Laken 1
167 Latvia 1
168 Le Roeulx 2
169 Libanon 1
170 Lille 4
171 Limburg 1
172 Limelette 2
173 Lisbon 1
174 Lithuania 2
175 Liège 17
176 Llanes 2
177 Lleida 3
178 Lodelinsart 2
179 Los Angeles 1
180 Louvain-la-Neuve 1
181 Lucca 2
182 Lund 3
183 Luxembourg 1
184 Luxemburg 3
185 Lyon 2
186 Maastricht 2
187 Madrid 3
188 Mali 1
189 Malmedy 2
190 Malta 1
191 Marostica 1
192 Marquise Islands 2
193 Marville 2
194 Melle 2
195 Merchtem 4
196 Mexico 5
197 Mirande 2
198 Mol 1
199 Moldavia 1
200 Monaco 2
201 Moncrabeau 1
202 Mongolia 1
203 Mons 10
204 Montenegro 1
205 Montreal 2
206 Moorea 1
207 Morocco 3
208 Moscow 1
209 Moulbaix 2
210 Mouscron 1
211 Moux 2
212 Munster 1
213 Namur 6
214 Neder-Over-Heembeek 4
215 Neurenburg 2
216 New York 1
217 Nicaragua 1
218 Nice 1
219 Nieuwpoort 2
220 Niger 2
221 Nishinomiya 1
222 Nivelles 2
223 Nogent-sur-Marne 2
224 North America 1
225 Norway 4
226 Nuoro 1
227 Nîmes 2
228 Oceania 1
229 Oost-Vlaanderen 1
230 Oostduinkerke 2
231 Osaka 1
232 Ostend 2
233 Panama 1
234 Papantla 1
235 Paris 9
236 Pauillac 2
237 Peizegem 1
238 Perpignan 2
239 Peru 3
240 Philippines 1
241 Poitiers 1
242 Poland 10
243 Pora Pora 2
244 Portugal 6
245 Quaregnon 1
246 Quebec 1
247 Quimperlé 3
248 Reims 4
249 Renaix 1
250 Riga 1
251 Rochefort 3
252 Roeselare 2
253 Romania 6
254 Ronse 2
255 Russia 6
256 Rwanda 1
257 Saint-Quentin 1
258 Samur 1
259 Saragoza 1
260 Schaerbeek 10
261 Serbia 1
262 Singapore 2
263 Slovakia 1
264 Slovenia 1
265 South Africa 2
266 South America 1
267 South Korea 3
268 Souvret 2
269 Spa 2
270 Spain 15
271 Sri Lanka 1
272 St.-Cécile 1
273 St.-Emilion 2
274 St.-Gille 1
275 St.-Maxim 1
276 St.-Troyan 1
277 Stavelot 1
278 Strassburg 1
279 Sully-sur-Loire 1
280 Sweden 1
281 Switzerland 3
282 Tahiti 1
283 Taiwan 1
284 Tallin 1
285 Thailand 1
286 The Netherlands 12
287 Theux 2
288 Thieulain 1
289 Thuin 4
290 Tokyo 3
291 Tongeren 2
292 Toulouse 1
293 Tournai 5
294 Troyes 1
295 Tunisia 4
296 Turkey 2
297 Turnhout 3
298 Uccle 7
299 Ukraine 3
300 United Arab Emirates 3
301 United Kingdom 9
302 United States 22
303 Uruguay 2
304 Uzbekistan 1
305 Valencia 3
306 Venezuela 1
307 Verviers 2
308 Vienna 2
309 Viesalm 2
310 Vietnam 1
311 Villers-la-Ville 1
312 Virton 2
313 Virton-Lateau 2
314 Vlaams Brabant 1
315 Vlaanderen 1
316 Waimes 2
317 Wallonie 1
318 Wavre 2
319 Weert 1
320 West-Vlaanderen 1
321 Woluwe-st.-Lambert 3
322 Woluwé-St.-Pierre 1
323 Wolvertem 1
324 World 1
325 Wépion 2
326 Ypres 1
327 Yugoslavia 1
328 Yukata-Ori 1
329 Zelzate 2
330 <NA> 3
What would I do with more time?
- Clean up/consolidate geo locations (a lot of subregions, where are all of these places? correlation to theme?)
- NLP on costume title and description (very messy rough text fields currently)
- Background research on what some of the groups/titles mean (a lot I’ve never heard of)
- Prettier visualization; perhaps interactive with more data incorporated