Project Description

Data Import & Guidance

You should now have the uncompressed aggregate and deaths folders on your local disk, each containing 5 csv files, all with headers. Next, create a database called PUBG in your MySQL service (I assume you already have one installed):

CREATE DATABASE PUBG;

Before you move on, here is the engineering part: you need to move the default database storage location (/var/lib/mysql/ by default) to another disk, whichever has the most free space.

Step 1. Stop the service and move your database to the new folder (run in bash):

/etc/init.d/mysql stop
# Become root to save the need to type all those sudos:
sudo su -
# Copy your data to the new folder (-R for recursive operation)
cp -R /var/lib/mysql/DATABASENAME /path/to/new/database/dir
# Remove the old dir from the MySQL data files
rm -R /var/lib/mysql/DATABASENAME
# Symlink the new location back into the MySQL data dir and give it the correct ownership:
ln -s /full/path/to/new/database/dir /var/lib/mysql/DATABASENAME
chown mysql:mysql /var/lib/mysql/DATABASENAME # change ownership
# Check that the mysql user can reach the new path; if you get denied,
# give permission to every parent dir:
sudo -u mysql ls /new/path/yourdatabase
chmod 755 /new/path/yourdatabase
# Edit AppArmor local configuration file
vim /etc/apparmor.d/local/usr.sbin.mysqld
# add to the end of the file (don't miss the trailing commas):
/new/path/yourdatabase/ r,
/new/path/yourdatabase/** rwk,
# Exit vim and make configuration effective:
apparmor_parser -r /etc/apparmor.d/usr.sbin.mysqld

# restart mysql server
/etc/init.d/mysql start

Now open the MySQL client and check that the relocated database still shows up and is queryable:

SHOW DATABASES;

After that, you also need one more configuration change to avoid a future error (I hit this one, so hopefully this saves you some time): (bash)

## Error Type: The total number of locks exceeds the lock table size
#  Reference Link: https://stackoverflow.com/questions/6901108/the-total-number-of-locks-exceeds-the-lock-table-size
## Solution:
# Locate the my.cnf file; on Linux it's at /etc/mysql/my.cnf
vim /etc/mysql/my.cnf # for Linux users
# add the following lines to the end of the file
[mysqld]
innodb_buffer_pool_size = 1G
# Restart server
/etc/init.d/mysql restart

Now we are ready to create the tables and import the files in MySQL.

-- create table 'meta' to store meta information for players
create table meta (
  date datetime,
  game_size int(3),
  match_id varchar(255),
  match_mode char(3),
  party_size int(1),
  player_assists int(3),
  player_dbno int(3),
  player_dist_ride float,
  player_dist_walk float,
  player_dmg int,
  player_kills int(3),
  player_name varchar(255),
  player_survive_time float,
  team_id int(3),
  team_placement int(3)
);

-- create table 'combat' to store combat information (every kill) for players
create table combat (
  killed_by varchar(255),
  killer_name varchar(255),
  killer_placement int(3),
  killer_position_x float,
  killer_position_y float,
  map varchar(255),
  match_id varchar(255),
  time int,
  victim_name varchar(255),
  victim_placement int(3),
  victim_position_x float,
  victim_position_y float
);
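Note that LOAD DATA without an explicit column list maps csv columns to table columns by position, so it's worth confirming that the column order in these DDLs matches the file headers. A minimal sketch in R (using the data.table package that appears later in this post, and the same file paths as the import commands below):

library(data.table)
# read only the header row of one file from each folder and inspect the column order
names(fread("./aggregate/agg_match_stats_0.csv", nrows = 0))
names(fread("./deaths/kill_match_stats_final_0.csv", nrows = 0))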

Now import the files from your local folder using bash:

# XXXXXX means your MySQL password
# IGNORE 1 LINES skips the header row that every csv file carries
for f in ./aggregate/agg_match_stats_{0..4}.csv; do
  mysql -uroot -pXXXXXX --local_infile=1 PUBG -e "LOAD DATA LOCAL INFILE '$f' INTO TABLE meta FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' IGNORE 1 LINES"
done

for f in ./deaths/kill_match_stats_final_{0..4}.csv; do
  mysql -uroot -pXXXXXX --local_infile=1 PUBG -e "LOAD DATA LOCAL INFILE '$f' INTO TABLE combat FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' IGNORE 1 LINES"
done

Strictly speaking, we don't need raw coordinate variables like killer_position_x and killer_position_y in the combat table; what we want is the shooting distance (the Euclidean distance between killer and victim). We can transform the combat table accordingly in MySQL:

Warning! This step is useful but takes a long time (several hours), so I recommend running it before you go to sleep. I use the resulting combat_summary table for the joins and analysis afterwards.

create table combat_summary (
  match_id varchar(255),
  map varchar(20),
  killer_name varchar(255),
  avg_shoot_range float,
  max_shoot_range float
);

insert into combat_summary
select a.match_id, a.map, a.killer_name,
       avg(a.shoot_range) as avg_shoot_range,
       max(a.shoot_range) as max_shoot_range
from (
  select match_id, map, killer_name,
         sqrt(power(killer_position_x - victim_position_x, 2) + power(killer_position_y - victim_position_y, 2)) as shoot_range
  from combat
) a
group by a.match_id, a.map, a.killer_name;
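Once the insert finishes, you can spot-check the result without leaving R. A minimal sketch, assuming the DBI/RMariaDB packages used later in this post and your local root credentials (XXXXXX is your MySQL password):

library(DBI)
library(RMariaDB)

con <- dbConnect(RMariaDB::MariaDB(), host = "localhost", user = "root",
                 password = "XXXXXX", dbname = "PUBG")
# the largest average shooting ranges should still be plausible in-game distances
dbGetQuery(con, "SELECT * FROM combat_summary ORDER BY avg_shoot_range DESC LIMIT 5")
dbDisconnect(con)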

Data Preparation

Here is how I use R to do the equivalent of SELECT player_name, COUNT(*) FROM meta GROUP BY player_name in just a few minutes:

library(tidyverse)
library(parallel)
library(data.table)
# show player games stat, use fread to boost speed
read_pubg_file <- function(filename){
  fread(filename, showProgress = FALSE) %>%
    group_by(player_name) %>%
    summarise(n = n())
}

# list all 5 files in aggregate folder
filenames <- list.files("./aggregate/", full.names = TRUE)

# mclapply() is a parallel version of lapply(); on Unix-alikes, mc.cores > 1 runs the file reads in parallel
res <- mclapply(filenames, read_pubg_file, mc.cores = 4) %>%
  bind_rows() # use this to combine multiple dataframes

# the same player may appear in several files, so group again and sum the per-file counts
df <- res %>%
  group_by(player_name) %>%
  summarise(n = sum(n)) %>%
  arrange(desc(n))# now use sum() instead

head(df)
# Now we'd like to create a one-column dataframe of selected players
player_games <- df[-1, ] # drop the empty-name entry, which has the largest count and so sits in row 1
# extract their name variables
select_name <- player_games %>%
  filter(n >= 30) %>%
  select(player_name)

write_csv(select_name, "PUBG_over30games_playername.csv")

Then I used the same method to filter and aggregate both the meta and combat tables (corresponding to the files in the aggregate and deaths folders).

Below is my data tidying workflow:

  • For the meta table (aggregate files)
  1. for each subfile, filter by select_name
  2. combine
  • For the combat table (deaths files), we also need to compute the combat-range info
  1. for each subfile, select only killer_name, killer_position_x, killer_position_y, map, match_id, victim_name, victim_position_x, victim_position_y
  2. filter by select_name and by killer_name != victim_name (don't count suicides)
  3. drop rows where either the killer position or the victim position is (0, 0) (see https://www.kaggle.com/mithrillion/kill-distance-analysis for the detailed reasons)
  4. mutate a new column shoot_range = sqrt((Δx)^2 + (Δy)^2) from the killer-victim coordinate differences, then drop the coordinate variables
  5. combine
  6. group by match_id, map and killer_name; summarise avg_shoot_range and max_shoot_range for each player per match
  • Inner join the 2 tables on match_id and player_name = killer_name
# For meta table (aggregate files)
read_pubg_file <- function(filename){
  fread(filename, showProgress = FALSE) %>%
    filter(player_name %in% select_name$player_name)
}

filenames <- list.files("./aggregate/", full.names = TRUE)

# mclapply() is a parallel version of lapply(); on Unix-alikes, mc.cores > 1 runs the file reads in parallel
meta <- mclapply(filenames, read_pubg_file, mc.cores = 4) %>%
  bind_rows() # use this to combine multiple dataframes

# save the filtered meta table to disk
save(meta, file = "meta_0826.RData")
# For combat table (deaths files)
read_pubg_file <- function(filename){
  fread(filename, showProgress = FALSE) %>%
    select(killer_name, killer_position_x, killer_position_y, map, match_id, victim_name, victim_position_x, victim_position_y) %>%
    filter(killer_name %in% select_name$player_name, killer_name != victim_name) %>%
    filter((victim_position_x != 0 | victim_position_y != 0), (killer_position_x != 0 | killer_position_y != 0)) %>%
    mutate(shoot_range = sqrt((killer_position_x - victim_position_x)^2 + (killer_position_y - victim_position_y)^2)) %>%
    select(match_id, map, killer_name, shoot_range, victim_name)
}

filenames <- list.files("./deaths/", full.names = TRUE)

# mclapply() is a parallel version of lapply(); on Unix-alikes, mc.cores > 1 runs the file reads in parallel
res <- mclapply(filenames, read_pubg_file, mc.cores = 4) %>%
  bind_rows() # use this to combine multiple dataframes

combat_range <- res %>%
    group_by(match_id, map, killer_name) %>%
    summarise(avg_shoot_range = mean(shoot_range), max_shoot_range = max(shoot_range))
agg_df <- meta %>%
  inner_join(combat_range, by = c("match_id", "player_name" = "killer_name"))

save(agg_df, file = "agg_df_0826.RData")
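Before moving on, a couple of quick sanity checks on the join (illustrative only). Since combat_range covers only killers, the inner join keeps just the player-match rows with at least one valid recorded kill, so agg_df should have fewer rows than meta:

# row counts before and after the join
nrow(meta)
nrow(combat_range)
nrow(agg_df)
# check that the join didn't duplicate any player-match pair
nrow(distinct(agg_df, match_id, player_name)) == nrow(agg_df)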

We can also upload the final data to an AWS MySQL database (optional). You first need to create a .my.cnf file in your home folder (for Linux users) with the following text:

For a tutorial on how to set up AWS RDS (free for an entire year with a 20GB trial instance), please read my blog post Setup Your Own Remote Database on AWS.

[PUBGinAWS]
database=PUBG
user=root
password=XXXXXX
host=YOUR_RDS_ENDPOINT # get this from the AWS console's Connect link

# save to AWS MySQL
library(DBI)
library(RMySQL) # RMySQL reads the [PUBGinAWS] group from ~/.my.cnf

conPUBG <- dbConnect(RMySQL::MySQL(), group = "PUBGinAWS")
dbListTables(conPUBG)

dbWriteTable(conPUBG, "aggregate_df", agg_df, row.names = FALSE)
dbListTables(conPUBG)
dbDisconnect(conPUBG)

Exploratory Data Analysis

Load the table from AWS RDS:

library(RMariaDB)
library(DBI)
conPUBG <- dbConnect(RMariaDB::MariaDB(), host = "yourhostip", 
                 user = "root", password = "yourpassword", dbname = "PUBG", port = 3306)
dbListTables(conPUBG)

df_PUBG <- dbReadTable(conPUBG, "aggregate_df")
dbDisconnect(conPUBG) # make sure to disconnect a DB

Now let's do exploratory data analysis on these 484,679 PUBG players.

library(tidyverse)
library(lubridate) # best package for dealing with datetime variables

df_PUBG <- agg_df # keep agg_df untouched as an archive copy
df_PUBG <- as_tibble(df_PUBG)
df_PUBG

# parse the datetime variable, then derive extra time variables such as hour, day of week, and date
df_PUBG$datetime <- ymd_hms(df_PUBG$date)
df_PUBG$hour <- hour(df_PUBG$datetime)
df_PUBG$dayofweek <- wday(df_PUBG$datetime)
df_PUBG$date <- date(df_PUBG$datetime)

# save
save(df_PUBG, file = "df_PUBG_0826.RData")

Now we'd like to generate a csv file of daily match statistics for Tableau visualization.

df_PUBG %>%
  distinct(match_id, date, match_mode, party_size, map) %>%
  write_csv(path = "tableau_pie_chart.csv")

Next, let's plot DAU (daily active users) vs. daily matches:

df_dau <- df_PUBG %>%
    distinct(date, player_name) %>%
    group_by(date) %>%
    summarise(n = n())

# drop the first and last days, which are only partially covered by the data
df_dau_endremoved <- df_dau[-c(1, 83), ]

df_matches <- df_PUBG %>%
  distinct(date, match_id) %>%
  group_by(date) %>%
  summarise(n = n())

df_matches_endremoved <- df_matches[-c(1, 83), ]

df_matches_endremoved$type = "match"
df_dau_endremoved$type = "dau"

df_match_dau <- rbind(df_dau_endremoved, df_matches_endremoved)

df_match_dau %>%
  ggplot(aes(x = date, y = n, color = type)) + geom_line(size = .8) + 
  labs(y = "Count", x = "", title = "Daily Active Users (DAU) vs. Daily Matches (2017-10-21 to 2018-01-09)") +
    annotate("text", x = ymd("2017-12-21"), y = 44000, label = "(2017-12-21, 44804)", size = 2.8, color = "grey50") 

ggsave(filename = "dau_matches_curve.png", height = 15, width = 22, units = "cm")
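The annotated peak above was read off the data; if you'd rather locate it programmatically, a quick illustrative check:

# find the DAU peak annotated in the plot
df_dau_endremoved %>% filter(n == max(n))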

Plot the distributions of the other variables:

library(gridExtra)

p1 <- df_PUBG %>%
  ggplot(aes(x = player_kills)) + geom_freqpoly(binwidth = 1, color = 'dodgerblue', size = .8) + 
  geom_histogram(aes(fill = factor(party_size)), binwidth = 1, alpha = .5, show.legend = FALSE) + 
  labs(y = "")

p2 <- df_PUBG %>%
  ggplot(aes(x = player_dbno)) + geom_freqpoly(binwidth = 1, color = 'dodgerblue', size = .8) + 
  geom_histogram(aes(fill = factor(party_size)), binwidth = 1, alpha = .5, show.legend = FALSE) + 
  labs(y = "")

p3 <- df_PUBG %>%
  ggplot(aes(x = player_assists)) + geom_freqpoly(binwidth = 1, color = 'dodgerblue', size = .8) + 
  geom_histogram(aes(fill = factor(party_size)), binwidth = 1, alpha = .5, show.legend = FALSE) + 
  labs(y = "")

p4 <- df_PUBG %>%
  ggplot(aes(x = player_dmg)) + geom_freqpoly(binwidth = 100, color = 'dodgerblue', size = .8) + 
  geom_histogram(aes(fill = factor(party_size)), binwidth = 100, alpha = .5, show.legend = FALSE) + 
  labs(y = "")

p5 <- df_PUBG %>%
  ggplot(aes(x = log10(player_dist_ride+1))) + geom_freqpoly(binwidth = .1, color = 'dodgerblue', size = .8) + 
  geom_histogram(aes(fill = factor(party_size)), binwidth = .1, alpha = .5, show.legend = FALSE) + 
  labs(y = "")

p6 <- df_PUBG %>%
  ggplot(aes(x = log10(player_dist_walk+1))) + geom_freqpoly(binwidth = .1, color = 'dodgerblue', size = .8) + 
  geom_histogram(aes(fill = factor(party_size)), binwidth = .1, alpha = .5) + 
  labs(y = "") + scale_fill_discrete("Party Size")

p7 <- df_PUBG %>%
  ggplot(aes(x = log10(player_survive_time+1))) + geom_freqpoly(binwidth = .1, color = 'dodgerblue', size = .8) + 
  geom_histogram(aes(fill = factor(party_size)), binwidth = .1,  alpha = .5, show.legend = FALSE) + 
  labs(y = "")

p8 <- df_PUBG %>%
  ggplot(aes(x = log10(avg_shoot_range+1))) + geom_freqpoly(binwidth = .1, color = 'dodgerblue', size = .8) + 
  geom_histogram(aes(fill = factor(party_size)), binwidth = .1, alpha = .5, show.legend = FALSE) + 
  labs(y = "")

p9 <- df_PUBG %>%
  ggplot(aes(x = log10(max_shoot_range+1))) + geom_freqpoly(binwidth = .1, color = 'dodgerblue', size = .8) + 
  geom_histogram(aes(fill = factor(party_size)), binwidth = .1, alpha = .5, show.legend = FALSE) + 
  labs(y = "")

p10 <- df_PUBG %>%
  ggplot(aes(x = hour)) + geom_freqpoly(binwidth = 1, color = 'dodgerblue', size = .8) + 
  geom_histogram(aes(fill = factor(party_size)), binwidth = 1, alpha = .5, show.legend = FALSE) + 
  labs(y = "")

grid.arrange(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, nrow = 5) 

After inspecting the variables, I cap player_survive_time at 2160 seconds: any record above 2160 is set to 2160.

df_PUBG$player_survive_time[df_PUBG$player_survive_time > 2160] = 2160
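A quick check that the cap took effect (illustrative; it should now return 0):

sum(df_PUBG$player_survive_time > 2160)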

Draw the correlation plot:

library(corrplot)
# drop identifier / categorical columns and negate team_placement so that a larger number means a higher rank
corr_mat <- df_PUBG %>%
  select(-c(date, game_size, match_id, match_mode, party_size, player_name, team_id, map, datetime, hour, dayofweek)) %>%
  mutate(team_rank = -team_placement) %>%
  select(-team_placement) %>%
  cor()

corrplot(corr_mat, method="circle", tl.cex = .8)