import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Load the 3 different datasets
dfSec = pd.read_csv("https://raw.githubusercontent.com/foorilla/infosec-jobs-com-salaries/main/salaries.csv")
dfAi = pd.read_csv("https://raw.githubusercontent.com/foorilla/ai-jobs-net-salaries/main/salaries.csv")
dfDevops = pd.read_csv("https://raw.githubusercontent.com/foorilla/devops-jobs-net-salaries/main/salaries.csv")

# Add a new "type" column, used to identify the original dataset
dfSec["dataset_type"] = "SECURITY"
dfAi["dataset_type"] = "AI"
dfDevops["dataset_type"] = "DEVOPS"

# Merge data into a single big dataframe
df = pd.concat([dfSec, dfAi, dfDevops], join='outer', axis=0)


# Convert the company location field from ISO 3166-2 (two letters) to ISO 3166-1 (three letters)
# This is needed to be able to use the "locations" map
dfCountryCodes = pd.read_csv("datasets/wikipedia-iso-country-codes.csv")[["country_name", "Alpha-2 code", "Alpha-3 code"]]

# Merge on the Alpha-2 code but keep only the Alpha-3 code
df = df.merge(dfCountryCodes, left_on="company_location", right_on="Alpha-2 code", how="left")
df = df.drop(columns=["Alpha-2 code"])

# Rename the Alpha-3 code column to "company_location_alpha3"
df = df.rename(columns={"Alpha-3 code": "company_location_alpha3"})
df


# Rename experience level to something more readable
df["experience_level"] = df["experience_level"].replace({"EN": "Junior", "MI": "Intermediate", "SE": "Senior", "EX": "Director"})

# Rename remote ratio to something more readable
df["remote_ratio"] = df["remote_ratio"].replace({0: "< 20% remote", 50: "Partial remote", 100: "> 80% remote"})


df.dtypes

work_year                   int64
experience_level           object
employment_type            object
job_title                  object
salary                      int64
salary_currency            object
salary_in_usd               int64
employee_residence         object
remote_ratio               object
company_location           object
company_size               object
dataset_type               object
country_name               object
company_location_alpha3    object
dtype: object


# Validating manually data with known values
print("Experience level: ", df["experience_level"].unique())
print("Employment type: ", df["employment_type"].unique())
print("Company size: ", df["employment_type"].unique())
print("Dataset type: ", df["dataset_type"].unique())
print("Remote ratio:" , df["remote_ratio"].unique())
print("Salary currency:", df["salary_currency"].unique())
print("Employee residence:", df["employee_residence"].unique())
print("Company location:", df["company_location"].unique())
print("Company location:", df["company_size"].unique())

Experience level:  ['Senior' 'Intermediate' 'Junior' 'Director']
Employment type:  ['FT' 'PT' 'CT' 'FL']
Company size:  ['FT' 'PT' 'CT' 'FL']
Dataset type:  ['SECURITY' 'AI' 'DEVOPS']
Remote ratio: ['Partial remote' '> 80% remote' '< 20% remote']
Salary currency: ['USD' 'EUR' 'INR' 'GBP' 'AUD' 'CAD' 'SEK' 'BRL' 'CHF' 'DKK' 'SGD' 'MXN'
 'ILS' 'PLN' 'NOK' 'IDR' 'NZD' 'HUF' 'ZAR' 'TWD' 'CZK' 'CNY' 'JPY' 'TRY'
 'CLP' 'HKD' 'RUB' 'PHP']
Employee residence: ['US' 'IN' 'ES' 'DE' 'CA' 'FR' 'GB' 'CZ' 'AU' 'FI' 'IT' 'PT' 'IE' 'PY'
 'GR' 'SE' 'CY' 'BA' 'BR' 'BW' 'TW' 'NL' 'AE' 'CH' 'DK' 'CL' 'SG' 'PK'
 'GH' 'AZ' 'RO' 'AR' 'DZ' 'AF' 'SI' 'HR' 'HU' 'MX' 'IL' 'PL' 'CR' 'JP'
 'NG' 'NO' 'ID' 'EE' 'KG' 'KE' 'BG' 'IR' 'NZ' 'BE' 'ZA' 'AT' 'LU' 'EG'
 'SA' 'LT' 'RU' 'TR' 'AS' 'SK' 'CN' 'PR' 'TH' 'BO' 'PH' 'DO' 'MY' 'HN'
 'TN' 'VN' 'IQ' 'UA' 'JE' 'RS' 'CO' 'HK' 'MD' 'MT' 'LV' 'AM' 'ZW' 'UY'
 'AL' 'MK' 'GT' 'SV' 'EC']
Company location: ['US' 'IN' 'ES' 'DE' 'CA' 'FR' 'GB' 'CZ' 'AU' 'FI' 'IT' 'IE' 'PY' 'GR'
 'SE' 'CY' 'BA' 'BR' 'BW' 'SG' 'NL' 'AE' 'CH' 'DK' 'CL' 'JP' 'AZ' 'RO'
 'DZ' 'AQ' 'AX' 'SI' 'HR' 'HU' 'MX' 'IL' 'PK' 'NO' 'PL' 'PT' 'RU' 'ID'
 'EE' 'KE' 'RS' 'AR' 'NZ' 'BE' 'ZA' 'AT' 'UM' 'LU' 'EG' 'TW' 'SA' 'AF'
 'TR' 'CR' 'IR' 'BS' 'SK' 'NG' 'PR' 'AS' 'TH' 'BO' 'PH' 'MY' 'HN' 'IQ'
 'UA' 'CN' 'CO' 'MD' 'MT' 'LV' 'TN' 'HK' 'AM' 'ZW' 'KR' 'MK' 'SV' 'BG'
 'LT']
Company location: ['L' 'M' 'S']


# Now, check if numeric data is OK
print("N. invalid salaries:", (df["salary"] <= 0).sum())
print("N. invalid salaries in USD:", (df["salary_in_usd"] <= 0).sum())

N. invalid salaries: 0
N. invalid salaries in USD: 0


print("AI dataset time frame: ", dfAi["work_year"].min() , "-", dfAi["work_year"].max())
print("DevOps dataset time frame: ", dfDevops["work_year"].min() , "-", dfDevops["work_year"].max())
print("IT security dataset time frame: ", dfSec["work_year"].min() , "-", dfSec["work_year"].max())

AI dataset time frame:  2020 - 2022
DevOps dataset time frame:  2020 - 2022
IT security dataset time frame:  2020 - 2022


fig = px.pie(df, names="dataset_type", title='Amount of rows of each dataset')
fig.update_layout(width=500)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()


fig = px.histogram(df, x="work_year", color="dataset_type")
fig.update_xaxes(type='category', categoryorder='category ascending', title="Year")
fig.update_layout(legend_title_text='Job kind', width=1000)
fig.update_yaxes(title="Amount of rows")
fig.show()


fig = go.Figure(data=[go.Pie(labels=df["company_location"], insidetextorientation='radial')])
fig.update_layout(title="Amount of rows per company location")
fig.update_traces(textposition='inside', textinfo='percent+label', hoverinfo="value+label+percent")
fig.show()


copyDf = df.copy().loc[df["company_location"] == "US"]

# Show pie chart of amount of rows per employee residence
isUs = df["employee_residence"] == "US"
isForeigner = df["employee_residence"] != "US"

copyDf.loc[isUs, "employee_residence"] = "United States"
copyDf.loc[isForeigner, "employee_residence"] = "Others"

# Show pie char of amount of rows per employee residence + show subgroups of US and foreigner that are remote using a sunburst chart
px.sunburst(copyDf,
  path=["employee_residence", "remote_ratio"],
  color="remote_ratio",
  title="Employee residence and remote ratio distribution in US companies",
  labels={"remote_ratio": "Remote ratio", "employee_residence": "Employee residence"},
)


fig = px.box(df, x="salary_in_usd", y="dataset_type", color="dataset_type", title="US average salary in USD per job type", category_orders={"dataset_type": ["AI", "SECURITY", "DEVOPS"]})
fig.update_layout(legend_title_text='Job kind', width=1000, xaxis_title="Salary in USD", yaxis_title="Job kind", font=dict(size=17))
fig.show()


USData = df.loc[df["company_location"] == "US"].sort_values(by="salary_in_usd", ascending=False).head(11)

# Average salary per job title per job kind
fig = px.histogram(USData, x="salary_in_usd", y="job_title", color="dataset_type", title="Top 10 most paid job titles in the US", histfunc="avg")
fig.update_yaxes(type='category', categoryorder='mean ascending', title="Job title")
fig.update_xaxes(title="Average salary in USD")
fig.update_layout(legend_title_text='Job kind', width=1000, font=dict(size=14))
fig.show()


# Now, compute most common job titles per job kind
usJobTitles = df.loc[df["company_location"] == "US"].groupby("job_title")

# Now, compute most common job titles per job kind
job_entries_per_title = usJobTitles.size().sort_values(ascending=False).to_frame().rename(columns={0: "count"}).reset_index()
job_salaries_per_title = usJobTitles["salary_in_usd"].mean().sort_values(ascending=False).to_frame().reset_index()
job_dataset_type_per_title = usJobTitles["dataset_type"].first().to_frame().reset_index()

# Merge the two dataframes into one
job_entries_salaries_per_title = job_entries_per_title.merge(job_salaries_per_title, on="job_title", how="left").merge(job_dataset_type_per_title, on="job_title", how="left")

# Now, order by count and then by salary
job_entries_salaries_per_title = job_entries_salaries_per_title.sort_values(by=["count", "salary_in_usd"], ascending=False)

# Now, show the top 10 in a bar chart
fig = px.bar(job_entries_salaries_per_title.head(10), x="salary_in_usd", y="job_title", color="dataset_type", title="The 10 most common and well-paying jobs in the US", text="count")
fig.update_yaxes(type='category', categoryorder='mean ascending', title="Job title")
fig.update_xaxes(title="Salary in USD and number of entries taken into account")
fig.update_layout(legend_title_text='Job kind', width=1000, font=dict(size=14))
fig.show()


# Compute the average salary for each country
dfCountry = df.groupby("company_location_alpha3")["salary_in_usd"].mean().sort_values(ascending=False).reset_index()
# Add also company_location in alpha-2 format
dfCountry = dfCountry.merge(dfCountryCodes, left_on="company_location_alpha3", right_on="Alpha-3 code", how="left").drop(columns=["Alpha-3 code"])
fig = px.choropleth(
  dfCountry,
  locations="company_location_alpha3",
  hover_name="country_name",
  color_continuous_scale=px.colors.sequential.Bluyl,
  title="World average salary in USD per company location",
  color="salary_in_usd",
  locationmode="ISO-3",
  hover_data=["salary_in_usd", "country_name"]
)
fig.update_layout(width=1000, height=600, coloraxis_colorbar=dict(title="Salary in USD"), font=dict(size=14))
fig.show()


# Now, divide company location into 3 categories: US, Europe and Rest of the world
copyDf = df.copy()
isEurope = copyDf["company_location"].isin(["DE", "FR", "GB", "ES", "IT", "NL", "SE", "CH", "BE", "AT", "DK", "NO", "IE", "FI", "CZ", "PL", "PT", "GR", "HU", "RO", "SK", "SI", "BG", "HR", "LT", "LV", "EE", "CY", "LU", "MT", "IS", "LI", "AL", "MK", "BA", "MD", "ME", "RS", "TR", "UA", "BY", "XK"])
isUs = copyDf["company_location"] == "US"
isRestOfTheWorld = ~isEurope & ~isUs

copyDf.loc[isEurope, "company_location"] = "Europe"
copyDf.loc[isUs, "company_location"] = "United States"
copyDf.loc[isRestOfTheWorld, "company_location"] = "Rest of the world"

# Now, show an heatmap of the average salary in USD per job title type and experience level in the US and Europe
fig = px.density_heatmap(
  copyDf,
  z="salary_in_usd",
  x="experience_level",
  y="dataset_type",
  title="Average salary in USD per job kind and experience level in the US, EU and rest of the world",
  histfunc="avg",
  category_orders={"experience_level": ["Junior", "Intermediate", "Senior", "Director"]},
  facet_col="company_location",
  facet_col_wrap=3,
  facet_col_spacing=0.05,
  facet_row_spacing=0.05,
)
fig.update_layout(width=1000, coloraxis_colorbar=dict(title="Salary in USD"))
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
# Print only the middle label
for axis in fig.layout:
   if type(fig.layout[axis]) == go.layout.XAxis:
        if axis == "xaxis2":
          fig.layout["xaxis2"].title="Experience level"
        else:
          fig.layout[axis].title=""
fig.update_yaxes(type='category', categoryorder='mean ascending', title_text="Job title", col=1, matches=None, showticklabels=True, visible=True)
fig.update_layout(font=dict(size=14))
fig.show()


# Rename the company size to make them more readable
copyDf = df.copy().loc[df["company_location"] == "US"]
copyDf["company_size"] = copyDf["company_size"].replace({"S": "< 50 employees", "M": "50 - 250 employees", "L": "> 250 employees"})

# World average salary in USD per company size
fig = px.box(
  copyDf,
  x="salary_in_usd",
  y="company_size",
  title="Average salary in USD per company size in the US", 
  category_orders={"company_size": ["< 50 employees", "50 - 250 employees", "> 250 employees"]}
)
fig.update_xaxes(title="Average salary in USD")
fig.update_yaxes(title="Company size")
fig.update_layout(width=1000, font=dict(size=15))
fig.show()


# Rename the remote ratio to make them more readable
copyDf = df.copy()

# Average salary in USD per remote ratio
fig = px.histogram(
  copyDf,
  x="salary_in_usd", 
  y="remote_ratio",
  color="dataset_type",
  title="Average salary in USD per remote ratio and job kind in the US",
  histfunc="avg",
  category_orders={"remote_ratio": ["< 20% remote", "Partial remote", "> 80% remote"]},
  barmode="group"
)
fig.update_layout(width=1000, legend_title_text='Job kind', font=dict(size=15))
fig.update_xaxes(title="Average salary in USD")
fig.update_yaxes(type='category', title="Remote ratio")
fig.show()


# Rename the employment types to make them more readable
copyDf = df.copy().loc[df["company_location"] == "US"]
copyDf["employment_type"] = copyDf["employment_type"].replace({"FT": "Full-time", "PT": "Part-time", "CT": "Contract", "FL": "Freelance"})

# World average salary in USD per employment type
fig = px.histogram(
  copyDf,
  x="salary_in_usd",
  y="employment_type",
  color="experience_level",
  title="Average salary in USD per employment type and experience level in the US",
  histfunc="avg",
  barmode="group",
  text_auto=True,
  category_orders={"experience_level": ["Junior", "Intermediate", "Senior", "Director"]},
  color_discrete_map={"Junior": "blue", "Intermediate": "green", "Senior": "orange", "Director": "red"}
)
fig.update_yaxes(type='category', categoryorder='mean ascending', title = "Employment type")
fig.update_xaxes(title = "Average salary in USD")
fig.update_layout(bargap=0.15, width=1100, height=600, legend_title_text='Experience level', font=dict(size=18))
fig.show()


copyDf = df.copy()
copyDf["job_title"] = copyDf.loc[copyDf["company_location"] == "US" ,"job_title"].str.split(" ").str[-1]

# Find all job titles present in all 3 datasets to have a fair comparison
selected_titles = copyDf[["job_title", "dataset_type"]].groupby(["job_title", "dataset_type"]).size().reset_index(name="count").groupby("job_title").filter(lambda x: len(x) == 3)["job_title"].unique().astype(str)

# Now, select only the job rank we are interested in
copyDf = copyDf.loc[copyDf["job_title"].isin(selected_titles)]

# Now, show an heatmap of the average salary in USD per job rank and experience level
fig = px.density_heatmap(
  copyDf,
  z="salary_in_usd",
  x="experience_level",
  y="job_title",
  title="Average salary in USD per job rank and experience level in the US",
  histfunc="avg",
  category_orders={"experience_level": ["Junior", "Intermediate", "Senior", "Director"]},
)
fig.update_yaxes(type='category', categoryorder='mean ascending')
fig.update_layout(xaxis_title="Experience level", yaxis_title="Job rank", width=1000, coloraxis_colorbar=dict(title="Salary in USD"))
fig.show()


# Extract job title types (last word)
copyDf = df.copy()
copyDf["job_title"] = copyDf.loc[copyDf["company_location"] == "US" ,"job_title"].str.split(" ").str[-1]

# Find all job titles present in all 3 datasets to have a fair comparison
selected_titles = copyDf[["job_title", "dataset_type"]].groupby(["job_title", "dataset_type"]).size().reset_index(name="count").groupby("job_title").filter(lambda x: len(x) == 3)["job_title"].unique().astype(str)

# Select only the job title types we are interested in
copyDf = copyDf.loc[copyDf["job_title"].isin(selected_titles)]

# Show an heatmap of the average salary in USD per job title type and experience level
fig = px.density_heatmap(
  copyDf,
  z="salary_in_usd",
  x="dataset_type",
  y="job_title",
  title="Average salary in USD per job rank and job kind in the US",
  histfunc="avg",
)

# If average is 0 then we should not show it
fig.update_yaxes(type='category', categoryorder='mean ascending')
fig.update_layout(
  xaxis_title="Job kind",
  yaxis_title="Job rank",
  legend_title_text="Average salary in USD",
  width=1000,
  coloraxis_colorbar=dict(title="Salary in USD")
)
fig.show()

IT jobs salary analysis¶

Introduction¶

Datasets used:¶

Datasets contents:¶

Libraries used:¶

Datasets columns¶

Loading datasets¶

Exploring the data¶

Data validation¶

Time frame of the data¶

Amount of data per dataset¶

Amount of data per year¶

Company location distribution¶

Employee residence and remote ratio distribution in US companies¶

Data analysis¶

Average annual salary in USD per job kind in the US¶

Top 10 most paid job titles in the US¶

The 10 most common and well-paying jobs in the US¶

World average salary in USD per company location¶

US vs Europe vs Rest of the world - Average salary in USD per experience level and job kind¶

Average salary in USD per company size in the US¶

Average salary in USD per remote ratio and job kind in the US¶

Average salary in USD per employment type and experience level in the US¶

Average salary in USD per job rank and experience level in the US¶

Average salary in USD per job rank and job kind in the US¶

	work_year	experience_level	employment_type	job_title	salary	salary_currency	salary_in_usd	employee_residence	remote_ratio	company_location	company_size	dataset_type	country_name	company_location_alpha3
0	2022	SE	FT	Staff Security Engineer	450000	USD	450000	US	50	US	L	SECURITY	United States	USA
1	2022	MI	FT	Application Security Engineer	18000	USD	18000	IN	50	IN	L	SECURITY	India	IND
2	2022	SE	FT	Penetration Testing Engineer	40000	EUR	42009	ES	100	ES	L	SECURITY	Spain	ESP
3	2022	SE	FT	DevOps Security Engineer	100000	EUR	105024	DE	100	DE	L	SECURITY	Germany	DEU
4	2022	SE	FT	Security Engineer	115000	USD	115000	US	0	US	M	SECURITY	United States	USA
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
3870	2021	EN	FT	DevOps Engineer	100000	USD	100000	US	100	US	L	DEVOPS	United States	USA
3871	2021	SE	FT	DevOps Engineer	125000	CHF	136713	CH	0	CH	M	DEVOPS	Switzerland	CHE
3872	2020	SE	FT	CloudOps Database Administrator	100000	EUR	114047	NL	0	NL	L	DEVOPS	Netherlands	NLD
3873	2021	SE	FT	DevOps Architect	165000	USD	165000	US	100	US	L	DEVOPS	United States	USA
3874	2021	MI	FT	DevOps Engineer	55000	EUR	65013	FR	100	FR	S	DEVOPS	France	FRA