
My Solutions#

Loading Packages#



from google.colab import auth
auth.authenticate_user()
print('Authenticated')

(the authentication prompt was interrupted; KeyboardInterrupt traceback omitted)
import pandas as pa
import requests
from bs4 import BeautifulSoup
import re
import matplotlib.pyplot as plt

SQL Subquery#

%%bigquery --project pic-math
WITH t as (
SELECT COUNT(*) as number_trips, start_station_name
FROM `bigquery-public-data.austin_bikeshare.bikeshare_trips`
GROUP BY start_station_name
ORDER BY number_trips DESC
LIMIT 2
)

SELECT *
FROM t
ORDER BY number_trips
LIMIT 1
number_trips start_station_name
0 40635 Riverside @ S. Lamar
%%bigquery --project pic-math

SELECT COUNT(*) as round_trips, start_station_name
FROM `bigquery-public-data.austin_bikeshare.bikeshare_trips`
WHERE start_station_name = end_station_name AND duration_minutes >= 60
GROUP BY start_station_name
round_trips start_station_name
0 190 Toomey Rd @ South Lamar
1 119 Waller & 6th St.
2 249 State Capitol @ 14th & Colorado
3 83 Rainey @ River St
4 80 Nueces @ 3rd
... ... ...
173 317 Republic Square @ Guadalupe & 4th St.
174 378 3rd & West
175 169 3rd/West
176 71 26th/Nueces
177 86 Nueces & 26th

178 rows × 2 columns

SQL JOINS#



%%bigquery --project pic-math
WITH stations as (
SELECT name, property_type
FROM `bigquery-public-data.austin_bikeshare.bikeshare_stations`
)

SELECT stations.property_type as starting_station_type, 
       AVG(trips.duration_minutes) as average_ride_minutes, 
       count(*) as number_of_trips, 
       STDDEV_POP(trips.duration_minutes) as std_ride_minutes
FROM `bigquery-public-data.austin_bikeshare.bikeshare_trips` as trips 
    LEFT OUTER JOIN stations 
      ON trips.start_station_name = stations.name
WHERE stations.property_type = 'sidewalk' OR stations.property_type = 'parkland'
GROUP BY stations.property_type
ORDER BY average_ride_minutes DESC

HTML Tables#

html = requests.get('https://en.wikipedia.org/wiki/List_of_Marvel_Cinematic_Universe_films')
soup = BeautifulSoup(html.text, 'lxml')

df3 = pa.read_html(str(soup.find_all('table', class_='wikitable plainrowheaders')[0]))[0]

df3.columns = df3.columns.droplevel(1)  # collapse the scraped two-level header
head = list(df3.columns[:-1])
head.append('Phase')  # rename the last column
df3.columns = head
df3.Phase = [1,1,1,1,1,1,0,2,2,2,2,2,2,0,3,3,3,3,3,3,3,3,3,3,3]  # 0 marks the phase-divider rows
df3.drop([6,13])  # display without the divider rows; note drop is not assigned back
Film U.S. release date Director(s) Screenwriter(s) Producer(s) Phase
0 Iron Man May 2, 2008 Jon Favreau[26] Mark Fergus & Hawk Ostby and Art Marcum & Matt... Avi Arad and Kevin Feige 1
1 The Incredible Hulk June 13, 2008 Louis Leterrier[28] Zak Penn[29] Avi Arad, Gale Anne Hurdand Kevin Feige 1
2 Iron Man 2 May 7, 2010 Jon Favreau[30] Justin Theroux[31] Kevin Feige 1
3 Thor May 6, 2011 Kenneth Branagh[32] Ashley Edward Miller & Zack Stentz and Don Pay... Kevin Feige 1
4 Captain America: The First Avenger July 22, 2011 Joe Johnston[34] Christopher Markus & Stephen McFeely[35] Kevin Feige 1
5 Marvel's The Avengers May 4, 2012 Joss Whedon[36] Joss Whedon[36] Kevin Feige 1
7 Iron Man 3 May 3, 2013 Shane Black[37] Drew Pearce and Shane Black[37][38] Kevin Feige 2
8 Thor: The Dark World November 8, 2013 Alan Taylor[39] Christopher L. Yost and Christopher Markus & S... Kevin Feige 2
9 Captain America: The Winter Soldier April 4, 2014 Anthony and Joe Russo[41] Christopher Markus & Stephen McFeely[42] Kevin Feige 2
10 Guardians of the Galaxy August 1, 2014 James Gunn[43] James Gunn and Nicole Perlman[44] Kevin Feige 2
11 Avengers: Age of Ultron May 1, 2015 Joss Whedon[45] Joss Whedon[45] Kevin Feige 2
12 Ant-Man July 17, 2015 Peyton Reed[46] Edgar Wright & Joe Cornish and Adam McKay & Pa... Kevin Feige 2
14 Captain America: Civil War May 6, 2016 Anthony and Joe Russo[48] Christopher Markus & Stephen McFeely[48] Kevin Feige 3
15 Doctor Strange November 4, 2016 Scott Derrickson[49] Jon Spaihts and Scott Derrickson & C. Robert C... Kevin Feige 3
16 Guardians of the Galaxy Vol. 2 May 5, 2017 James Gunn[44] James Gunn[44] Kevin Feige 3
17 Spider-Man: Homecoming July 7, 2017 Jon Watts[51] Jonathan Goldstein & John Francis Daley andJon... Kevin Feige and Amy Pascal 3
18 Thor: Ragnarok November 3, 2017 Taika Waititi[53] Eric Pearson and Craig Kyle & Christopher L. Y... Kevin Feige 3
19 Black Panther February 16, 2018 Ryan Coogler[56] Ryan Coogler & Joe Robert Cole[57][58] Kevin Feige 3
20 Avengers: Infinity War April 27, 2018 Anthony and Joe Russo[59] Christopher Markus & Stephen McFeely[60] Kevin Feige 3
21 Ant-Man and the Wasp July 6, 2018 Peyton Reed[61] Chris McKenna & Erik Sommers andPaul Rudd & An... Kevin Feige and Stephen Broussard 3
22 Captain Marvel March 8, 2019 Anna Boden and Ryan Fleck[63] Anna Boden & Ryan Fleck & Geneva Robertson-Dwo... Kevin Feige 3
23 Avengers: Endgame April 26, 2019 Anthony and Joe Russo[59] Christopher Markus & Stephen McFeely[60] Kevin Feige 3
24 Spider-Man: Far From Home July 2, 2019 Jon Watts[65] Chris McKenna & Erik Sommers[66] Kevin Feige and Amy Pascal 3
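
Since drop returns a new DataFrame rather than modifying df3 in place, the divider rows at index 6 and 13 are only gone in the display above. A minimal sketch to make the removal stick:

df3 = df3.drop([6, 13]).reset_index(drop=True)  # assign back; reset_index renumbers the 23 films 0 through 22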

HTML Selenium#

# RUN THIS CELL WHEN USING THE NOTEBOOK ON COLAB - NO PREVIOUS INSTALLATION OF SELENIUM IS NEEDED
# install chromium, its driver, and selenium
!apt update
!apt install chromium-chromedriver
!pip install selenium
# set options to be headless
from selenium import webdriver
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# open it, go to a website, and get results
driver = webdriver.Chrome('chromedriver',options=options)
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

(apt and pip output omitted: chromium-browser and chromium-chromedriver 97.0.4692.71 installed via apt; selenium 4.1.0 installed via pip, pulling urllib3 up to 1.26.8 with a dependency-conflict warning against requests 2.23.0)
url = 'https://google.com'
driver.get(url)


elem = driver.find_element(By.XPATH, '//input')

elem.send_keys("Tottenham Football Club")

#elem = driver.find_element(By.XPATH, '//input[@name = "btnI"]')
elem.send_keys(Keys.ENTER)
driver.current_url
'https://www.google.com/search?q=Tottenham+Football+Club&source=hp&ei=QAT7YZXeJYDQytMPw8Ws4AI&iflsig=AHkkrS4AAAAAYfsSUMHkfn7vB6MqyKlrFkHY89DtlTuv&ved=0ahUKEwiV_outh-L1AhUAqHIEHcMiCywQ4dUDCAk&uact=5&oq=Tottenham+Football+Club&gs_lcp=Cgdnd3Mtd2l6EANQAFhjYIQBaABwAHgAgAEAiAEAkgEAmAEAoAEB&sclient=gws-wiz'
from bs4 import BeautifulSoup
soup = BeautifulSoup(driver.page_source)
soup.title
<title>Tottenham Football Club - Google Search</title>

The implicit wait did nothing for me. The best option seemed to be waiting for staleness_of on the old page. Waiting is hard!

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.expected_conditions import staleness_of

old_page = driver.find_element(By.XPATH,'//html')
driver.find_element(By.PARTIAL_LINK_TEXT,'twitter').click()

WebDriverWait(driver, 10).until(staleness_of(old_page))
#driver.implicitly_wait(5)

driver.current_url
'https://twitter.com/SpursOfficial?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor'
driver.title
'Tottenham Hotspur (@SpursOfficial) / Twitter'
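
For the record, an explicit wait on a different condition would also work here; a sketch, assuming the clicked result really does lead off to Twitter as above:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 seconds for the browser URL to contain 'twitter',
# rather than watching the old page go stale
WebDriverWait(driver, 10).until(EC.url_contains('twitter'))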

Strings#

line = 'happy birthday to you\n'

print(line.capitalize() +  line + line.replace("to you","dear {}".format(input("Enter the birthday celebrant's name: "))) + line)
Enter the birthday celebrant's name: Nick
Happy birthday to you
happy birthday to you
happy birthday dear Nick
happy birthday to you
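
The same song built up step by step instead of in one long expression; a sketch using an f-string:

name = input("Enter the birthday celebrant's name: ")
line = 'happy birthday to you\n'
song = line.capitalize() + line + line.replace("to you", f"dear {name}") + line
print(song)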

String Cleaning#

import requests
import pandas as pa
import re
from bs4 import BeautifulSoup


r = requests.get('https://en.wikipedia.org/wiki/List_of_highest_mountains_on_Earth')
html_contents = r.text
html_soup = BeautifulSoup(html_contents,"lxml")
tables = html_soup.find_all('table',class_="wikitable")

df1 = pa.read_html(str(tables))[0]
df1.columns = df1.columns.droplevel(0).droplevel(0)
df1.head()
cols = df1.columns.map(lambda s: re.sub(r"\[(.+)\]","",s))

cols
Index(['Rank', 'Mountain name(s)', 'm', 'ft', 'm', 'ft', 'Range',
       'Coordinates', 'Parent mountain', '1st', 'y', 'n',
       'Country (disputed claims in italics)'],
      dtype='object')
re.sub(r"\((.+)\)","",cols[1])
'Mountain name'
cols = cols.map(lambda s: re.sub(r"\((.+)\)","",s))

cols
Index(['Rank', 'Mountain name', 'm', 'ft', 'm', 'ft', 'Range', 'Coordinates',
       'Parent mountain', '1st', 'y', 'n', 'Country '],
      dtype='object')
re.sub(r" ","_",cols[1])
'Mountain_name'
cols = cols.map(lambda s: s.strip())
cols
Index(['Rank', 'Mountain name', 'm', 'ft', 'm', 'ft', 'Range', 'Coordinates',
       'Parent mountain', '1st', 'y', 'n', 'Country'],
      dtype='object')
cols = cols.map(lambda s: re.sub(r" ","_",s))

cols
Index(['Rank', 'Mountain_name', 'm', 'ft', 'm', 'ft', 'Range', 'Coordinates',
       'Parent_mountain', '1st', 'y', 'n', 'Country'],
      dtype='object')
cols = cols.map(lambda s : s.lower())

cols
Index(['rank', 'mountain_name', 'm', 'ft', 'm', 'ft', 'range', 'coordinates',
       'parent_mountain', '1st', 'y', 'n', 'country'],
      dtype='object')
df1.columns = cols
df1.head()
rank mountain_name m ft m ft range coordinates parent_mountain 1st y n country
0 1 .mw-parser-output ul.cslist,.mw-parser-output ... 8848 29,029[dp 7] 8848 29029 Mahalangur Himalaya .mw-parser-output .geo-default,.mw-parser-outp... 1953 145 121 NepalChina
1 2 K2 8611 28251 4020 13190 Baltoro Karakoram 35°52′53″N 76°30′48″E / 35.88139°N 76.51333°E Mount Everest 1954 45 44 Pakistan[dp 8]China[12]
2 3 Kangchenjunga 8586 28169 3922 12867 Kangchenjunga Himalaya 27°42′12″N 88°08′51″E / 27.70333°N 88.14750°E * Mount Everest 1955 38 24 NepalIndia
3 4 Lhotse 8516 27940 610 2000 Mahalangur Himalaya 27°57′42″N 86°55′59″E / 27.96167°N 86.93306°E Mount Everest 1956 26 26 NepalChina
4 5 Makalu 8485 27838 2378 7802 Mahalangur Himalaya 27°53′23″N 87°05′20″E / 27.88972°N 87.08889°E Mount Everest 1955 45 NepalChina
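
All four cleaning passes can be folded into a single map, for what it's worth; a sketch equivalent to the steps above:

def clean(s):
    s = re.sub(r"\[(.+)\]", "", s)   # strip footnote markers like [dp 1]
    s = re.sub(r"\((.+)\)", "", s)   # strip parentheticals like (s)
    return s.strip().replace(" ", "_").lower()

df1.columns = df1.columns.map(clean)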

Strings and Regular Expressions: Cleaning the Country Column#

r = requests.get('https://en.wikipedia.org/wiki/List_of_highest_mountains_on_Earth')
html_contents = r.text
html_soup = BeautifulSoup(html_contents,"lxml")
tables = html_soup.find_all('table',class_="wikitable")

df1 = pa.read_html(str(tables))[0]
df1.columns = df1.columns.droplevel(0).droplevel(0)
df1.head()
df1.iloc[:,-1]
0                            NepalChina
1               Pakistan[dp 8]China[12]
2                            NepalIndia
3                            NepalChina
4                            NepalChina
                     ...               
115                               China
116                          NepalChina
117                  BhutanChina[dp 18]
118    IndiaChina[dp 10][dp 11]'[dp 12]
119                      Pakistan[dp 8]
Name: Country (disputed claims in italics), Length: 120, dtype: object
newcol = df1.iloc[:,-1]

newcol = newcol.apply(lambda x: re.sub(r"\[(.+?)\]","",x))

newcol
0         NepalChina
1      PakistanChina
2         NepalIndia
3         NepalChina
4         NepalChina
           ...      
115            China
116       NepalChina
117      BhutanChina
118      IndiaChina'
119         Pakistan
Name: Country (disputed claims in italics), Length: 120, dtype: object

I still see an unexpected character (a stray apostrophe on row 118), so I’ll remove that one too.

newcol = newcol.apply(lambda x: re.sub(r"[^A-z]","",x))

newcol
0         NepalChina
1      PakistanChina
2         NepalIndia
3         NepalChina
4         NepalChina
           ...      
115            China
116       NepalChina
117      BhutanChina
118       IndiaChina
119         Pakistan
Name: Country (disputed claims in italics), Length: 120, dtype: object
newcol = newcol.apply(lambda x: re.findall(r"[A-Z][a-z]*",x))
newcol
0         [Nepal, China]
1      [Pakistan, China]
2         [Nepal, India]
3         [Nepal, China]
4         [Nepal, China]
             ...        
115              [China]
116       [Nepal, China]
117      [Bhutan, China]
118       [India, China]
119           [Pakistan]
Name: Country (disputed claims in italics), Length: 120, dtype: object

I need to find the greatest number of countries that meet at a single mountain. I find the length of each list and take the max of that.

max(newcol.apply(lambda x: len(x)))
3

I know there are at most 3 countries per mountain. I’ll make three columns to hold the possible answers.

newcols = pa.DataFrame(newcol.to_list(), columns = ['country1','country2','country3'])

newcols
country1 country2 country3
0 Nepal China None
1 Pakistan China None
2 Nepal India None
3 Nepal China None
4 Nepal China None
... ... ... ...
115 China None None
116 Nepal China None
117 Bhutan China None
118 India China None
119 Pakistan None None

120 rows × 3 columns

I’ll add that back into the dataframe using concat on axis = 1.

df1 = pa.concat([df1,newcols], axis = 1)

df1.head()
Rank[dp 1] Mountain name(s) m ft m ft Range Coordinates[dp 4] Parent mountain[dp 5] 1st y n Country (disputed claims in italics) country1 country2 country3
0 1 .mw-parser-output ul.cslist,.mw-parser-output ... 8848 29,029[dp 7] 8848 29029 Mahalangur Himalaya .mw-parser-output .geo-default,.mw-parser-outp... 1953 145 121 NepalChina Nepal China None
1 2 K2 8611 28251 4020 13190 Baltoro Karakoram 35°52′53″N 76°30′48″E / 35.88139°N 76.51333°E Mount Everest 1954 45 44 Pakistan[dp 8]China[12] Pakistan China None
2 3 Kangchenjunga 8586 28169 3922 12867 Kangchenjunga Himalaya 27°42′12″N 88°08′51″E / 27.70333°N 88.14750°E * Mount Everest 1955 38 24 NepalIndia Nepal India None
3 4 Lhotse 8516 27940 610 2000 Mahalangur Himalaya 27°57′42″N 86°55′59″E / 27.96167°N 86.93306°E Mount Everest 1956 26 26 NepalChina Nepal China None
4 5 Makalu 8485 27838 2378 7802 Mahalangur Himalaya 27°53′23″N 87°05′20″E / 27.88972°N 87.08889°E Mount Everest 1955 45 NepalChina Nepal China None

Dates#

iot = pa.read_csv('https://raw.githubusercontent.com/nurfnick/Data_Viz/main/IOT-temp.csv')

iot.head()
id room_id/id noted_date temp out/in
0 __export__.temp_log_196134_bd201015 Room Admin 08-12-2018 09:30 29 In
1 __export__.temp_log_196131_7bca51bc Room Admin 08-12-2018 09:30 29 In
2 __export__.temp_log_196127_522915e3 Room Admin 08-12-2018 09:29 41 Out
3 __export__.temp_log_196128_be0919cf Room Admin 08-12-2018 09:29 41 Out
4 __export__.temp_log_196126_d30b72fb Room Admin 08-12-2018 09:29 31 In
iot.shape
(97606, 5)
iot.noted_date = pa.to_datetime(iot.noted_date,format = '%d-%m-%Y %H:%M')
iot.noted_date.max()
Timestamp('2018-12-08 09:30:00')
iot.noted_date.min()
Timestamp('2018-07-28 07:06:00')
(iot.noted_date.shift() - iot.noted_date).max()
Timedelta('11 days 13:38:00')
(iot.noted_date.shift() - iot.noted_date).idxmax()
14038

That is a bigger jump than expected! The device must have been turned off or stopped reporting for a bit. Let’s see where that happened.

iot.iloc[14030:14050,:]
id room_id/id noted_date temp out/in
14030 __export__.temp_log_102425_e0206705 Room Admin 2018-11-17 10:35:00 46 Out
14031 __export__.temp_log_142054_05d7f31e Room Admin 2018-11-17 10:31:00 45 Out
14032 __export__.temp_log_132828_a446ef24 Room Admin 2018-11-17 10:29:00 44 Out
14033 __export__.temp_log_130368_429948b8 Room Admin 2018-11-17 10:19:00 45 Out
14034 __export__.temp_log_129679_c7815c3a Room Admin 2018-11-17 10:17:00 46 Out
14035 __export__.temp_log_115558_f8f70efd Room Admin 2018-11-17 10:09:00 45 Out
14036 __export__.temp_log_134401_87b1348c Room Admin 2018-11-17 09:43:00 46 Out
14037 __export__.temp_log_134969_a8fe035c Room Admin 2018-11-17 09:41:00 45 Out
14038 __export__.temp_log_105931_d412d864 Room Admin 2018-11-05 20:03:00 41 Out
14039 __export__.temp_log_137841_5842365f Room Admin 2018-11-05 20:01:00 41 Out
14040 __export__.temp_log_116451_f584e59d Room Admin 2018-11-05 19:59:00 40 Out
14041 __export__.temp_log_90838_35fed2fd Room Admin 2018-11-05 19:57:00 41 Out
14042 __export__.temp_log_100510_81c084b2 Room Admin 2018-11-05 19:56:00 32 In
14043 __export__.temp_log_147095_21d94fe0 Room Admin 2018-11-05 19:53:00 41 Out
14044 __export__.temp_log_110222_030f5157 Room Admin 2018-11-05 19:51:00 41 Out
14045 __export__.temp_log_148651_5199c024 Room Admin 2018-11-05 19:47:00 42 Out
14046 __export__.temp_log_108041_1583682c Room Admin 2018-11-05 19:44:00 32 In
14047 __export__.temp_log_108105_9c42994b Room Admin 2018-11-05 19:43:00 41 Out
14048 __export__.temp_log_129004_bf8c5c0c Room Admin 2018-11-05 19:41:00 41 Out
14049 __export__.temp_log_111575_de593acd Room Admin 2018-11-05 19:39:00 41 Out
iot.noted_date.mean()
Timestamp('2018-10-07 05:10:38.821178880')
iot[iot.noted_date.dt.date == pa.Timestamp('09-11-2018')]
/usr/local/lib/python3.7/dist-packages/pandas/core/ops/array_ops.py:73: FutureWarning: Comparison of Timestamp with datetime.date is deprecated in order to match the standard library behavior.  In a future version these will be considered non-comparable.Use 'ts == pd.Timestamp(date)' or 'ts.date() == date' instead.
  result = libops.scalar_compare(x.ravel(), y, op)
id room_id/id noted_date temp out/in
63867 __export__.temp_log_13951_c7fd4bf2 Room Admin 2018-09-11 23:59:00 28 Out
63868 __export__.temp_log_125783_87502329 Room Admin 2018-09-11 23:59:00 28 In
63869 __export__.temp_log_117810_921a0b1d Room Admin 2018-09-11 23:59:00 27 In
63870 __export__.temp_log_13950_419fb8ec Room Admin 2018-09-11 23:59:00 27 Out
63871 __export__.temp_log_13945_96e421ea Room Admin 2018-09-11 23:58:00 27 Out
... ... ... ... ... ...
73364 __export__.temp_log_118244_25e68d24 Room Admin 2018-09-11 07:38:00 33 Out
73365 __export__.temp_log_135028_912344cf Room Admin 2018-09-11 07:38:00 33 In
73366 __export__.temp_log_141419_d999d57a Room Admin 2018-09-11 07:38:00 32 Out
73367 __export__.temp_log_112252_eee53ad5 Room Admin 2018-09-11 07:38:00 32 Out
73368 __export__.temp_log_91587_c5eb7913 Room Admin 2018-09-11 07:38:00 33 Out

9502 rows × 5 columns

start = pa.Timestamp('09-11-2018')
end = pa.Timestamp('09-12-2018')


iot[iot.noted_date.between(start,end)]
id room_id/id noted_date temp out/in
63853 __export__.temp_log_113659_fad18b77 Room Admin 2018-09-12 00:00:00 27 Out
63854 __export__.temp_log_13965_2d758a5b Room Admin 2018-09-12 00:00:00 27 Out
63855 __export__.temp_log_148141_1ed4c048 Room Admin 2018-09-12 00:00:00 27 Out
63856 __export__.temp_log_13964_b6650f58 Room Admin 2018-09-12 00:00:00 27 Out
63857 __export__.temp_log_112729_9ad42af4 Room Admin 2018-09-12 00:00:00 27 Out
... ... ... ... ... ...
73364 __export__.temp_log_118244_25e68d24 Room Admin 2018-09-11 07:38:00 33 Out
73365 __export__.temp_log_135028_912344cf Room Admin 2018-09-11 07:38:00 33 In
73366 __export__.temp_log_141419_d999d57a Room Admin 2018-09-11 07:38:00 32 Out
73367 __export__.temp_log_112252_eee53ad5 Room Admin 2018-09-11 07:38:00 32 Out
73368 __export__.temp_log_91587_c5eb7913 Room Admin 2018-09-11 07:38:00 33 Out

9516 rows × 5 columns

I tried for a while to get around the deprecation warning and this was the best I could come up with. It does include a few readings that happen exactly at midnight on the end date. I am okay with that…

iot[(iot.noted_date.between(start,end))& (iot['out/in'] == 'Out')].temp.mean()
30.051679232350924

Actually, I can fix that by not allowing the inclusion of the right endpoint.

iot[iot.noted_date.between(start,end, inclusive = 'left')]
id room_id/id noted_date temp out/in
63867 __export__.temp_log_13951_c7fd4bf2 Room Admin 2018-09-11 23:59:00 28 Out
63868 __export__.temp_log_125783_87502329 Room Admin 2018-09-11 23:59:00 28 In
63869 __export__.temp_log_117810_921a0b1d Room Admin 2018-09-11 23:59:00 27 In
63870 __export__.temp_log_13950_419fb8ec Room Admin 2018-09-11 23:59:00 27 Out
63871 __export__.temp_log_13945_96e421ea Room Admin 2018-09-11 23:58:00 27 Out
... ... ... ... ... ...
73364 __export__.temp_log_118244_25e68d24 Room Admin 2018-09-11 07:38:00 33 Out
73365 __export__.temp_log_135028_912344cf Room Admin 2018-09-11 07:38:00 33 In
73366 __export__.temp_log_141419_d999d57a Room Admin 2018-09-11 07:38:00 32 Out
73367 __export__.temp_log_112252_eee53ad5 Room Admin 2018-09-11 07:38:00 32 Out
73368 __export__.temp_log_91587_c5eb7913 Room Admin 2018-09-11 07:38:00 33 Out

9502 rows × 5 columns

iot[(iot.noted_date.between(start,end, inclusive = "left"))& (iot['out/in'] == 'Out')].temp.mean()
30.057547040241726

I was right not to worry about that shifting the average too much…
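
As an aside, the deprecation warning from the earlier dt.date comparison can be avoided entirely by normalizing the timestamps to midnight first; a sketch:

# dt.normalize() keeps the dtype as datetime64, so comparing against a Timestamp raises no warning
iot[iot.noted_date.dt.normalize() == pa.Timestamp('2018-09-11')]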

Integers and Floats#

df = pa.read_csv('https://raw.githubusercontent.com/nurfnick/Data_Viz/main/iris.csv')

df.head()
SepalLength SepalWidth PedalLength PedalWidth Class
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa
df.SepalLength.mean()
5.843333333333335
df.SepalLength.astype('int').mean()
5.386666666666667
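
astype('int') truncates toward zero rather than rounding, which is why the mean drops by almost half a unit. Rounding first lands much closer; a quick check:

df.SepalLength.round().astype('int').mean()  # round, then convert; most of the truncation bias disappears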
df.groupby('Class').agg('mean')
SepalLength SepalWidth PedalLength PedalWidth
Class
Iris-setosa 5.006 3.418 1.464 0.244
Iris-versicolor 5.936 2.770 4.260 1.326
Iris-virginica 6.588 2.974 5.552 2.026

You cannot just convert a groupby!

df.groupby('Class').astype('int').agg('mean')  # raises AttributeError: a grouped object has no astype method
df2 = df[['PedalLength','PedalWidth','SepalLength','SepalWidth']].astype('int')
df2 = pa.concat([df2,df.Class],axis = 1)

df2.head()
PedalLength PedalWidth SepalLength SepalWidth Class
0 1 0 5 3 Iris-setosa
1 1 0 4 3 Iris-setosa
2 1 0 4 3 Iris-setosa
3 1 0 4 3 Iris-setosa
4 1 0 5 3 Iris-setosa
df2.groupby('Class').agg('mean')
PedalLength PedalWidth SepalLength SepalWidth
Class
Iris-setosa 1.00 0.00 4.60 3.04
Iris-versicolor 3.82 1.00 5.48 2.32
Iris-virginica 5.10 1.58 6.08 2.58
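
An alternative sketch that skips building df2 entirely: do the integer conversion inside agg. Class is the group key, so every remaining column is numeric.

df.groupby('Class').agg(lambda s: s.astype('int').mean())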
df.groupby('Class').agg(['mean','median','count','std'])
SepalLength SepalWidth PedalLength PedalWidth
mean median count std mean median count std mean median count std mean median count std
Class
Iris-setosa 5.006 5.0 50 0.352490 3.418 3.4 50 0.381024 1.464 1.50 50 0.173511 0.244 0.2 50 0.107210
Iris-versicolor 5.936 5.9 50 0.516171 2.770 2.8 50 0.313798 4.260 4.35 50 0.469911 1.326 1.3 50 0.197753
Iris-virginica 6.588 6.5 50 0.635880 2.974 3.0 50 0.322497 5.552 5.55 50 0.551895 2.026 2.0 50 0.274650
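
If the two-level column index from the multi-aggregation gets in the way, it can be flattened; a sketch:

summary = df.groupby('Class').agg(['mean', 'median', 'count', 'std'])
summary.columns = ['_'.join(col) for col in summary.columns]  # e.g. 'SepalLength_mean'
summary.head()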

Visualize Amounts#

import pandas as pa

df3 = pa.read_csv('https://raw.githubusercontent.com/nurfnick/Data_Viz/main/AB_NYC_2019.csv')
!pip install --upgrade matplotlib  # ax.bar_label below needs matplotlib >= 3.4; Colab ships 3.2.2
(pip output omitted: matplotlib upgraded from 3.2.2 to 3.5.1)
ax = df3.groupby('neighbourhood_group').price.agg('max').plot.bar(ylim = [0,11100], 
                                                                  title = 'Max Price by Borough', 
                                                                  ylabel = 'Price in Dollars',
                                                                  rot = 45)

for container in ax.containers:
    ax.bar_label(container)

[figure: bar chart, Max Price by Borough, with value labels]


df3.groupby(['neighbourhood_group','room_type']).price.agg('max').plot.bar(y = 'room_type')

<matplotlib.axes._subplots.AxesSubplot at 0x7f0f5b778e90>
[figure: bar chart of max price by borough and room type, flat index]

I had to do it with a pivot table to get around the double index the groupby creates.

df_pivot = pa.pivot_table(
    df3, 
    values="price",
    index="neighbourhood_group",
    columns="room_type", 
    aggfunc=max
)

df_pivot.plot.bar(title = 'Grouped by Borough')
<matplotlib.axes._subplots.AxesSubplot at 0x7f0f5b7ec410>
[figure: grouped bar chart, Grouped by Borough]
df_pivot
room_type Entire home/apt Private room Shared room
neighbourhood_group
Bronx 1000 2500 800
Brooklyn 10000 7500 725
Manhattan 10000 9999 1000
Queens 2600 10000 1800
Staten Island 5000 300 150
df_pivot = pa.pivot_table(
    df3, 
    values="price",
    index="room_type",
    columns="neighbourhood_group", 
    aggfunc=max
)

df_pivot.plot.bar(title = 'Grouped by Room Type')
<matplotlib.axes._subplots.AxesSubplot at 0x7f0f5b12f310>
[figure: grouped bar chart, Grouped by Room Type]
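
For what it's worth, unstack() on the grouped result reaches the same grouped-bar layout without an explicit pivot table; a sketch:

# unstack moves room_type out of the row index and into the columns, matching the pivot above
df3.groupby(['neighbourhood_group', 'room_type']).price.max().unstack().plot.bar(title = 'Grouped by Borough')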

Visualize Histograms#

df3.price.plot.hist(title = "Price of Air B&B in NYC with Outliers Removed",bins = 100, xlim = [0,2000]).set_xlabel("Price")
Text(0.5, 0, 'Price')
[figure: histogram, Price of Air B&B in NYC with Outliers Removed]
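
Strictly speaking, the xlim above only hides the outliers; the 100 bins are still spread across the full price range. Filtering the frame first actually removes them; a sketch (the 2000 cutoff is my own choice):

df3[df3.price <= 2000].price.plot.hist(title = "Price of Air B&B in NYC, Prices at Most $2000", bins = 100).set_xlabel("Price")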
df3.price.plot.hist(title = "Price of Air B&B in NYC with Outliers",bins = 100)
<matplotlib.axes._subplots.AxesSubplot at 0x7f0f557fb210>
[figure: histogram, Price of Air B&B in NYC with Outliers]
df3.price.plot.hist(title = "Price of Air B&B in NYC with Outliers",bins = 100, logx = True)
<matplotlib.axes._subplots.AxesSubplot at 0x7f865ca97150>
[figure: histogram, Price of Air B&B in NYC with Outliers, log x-axis]
df3.groupby('neighbourhood_group').price.plot.hist(alpha = .7, bins = 100, xlim = [0,2000], legend = True, title = "Distribution of Boroughs Pricing with Outliers Removed")
neighbourhood_group
Bronx            AxesSubplot(0.125,0.125;0.775x0.755)
Brooklyn         AxesSubplot(0.125,0.125;0.775x0.755)
Manhattan        AxesSubplot(0.125,0.125;0.775x0.755)
Queens           AxesSubplot(0.125,0.125;0.775x0.755)
Staten Island    AxesSubplot(0.125,0.125;0.775x0.755)
Name: price, dtype: object
[figure: overlaid histograms, Distribution of Boroughs Pricing with Outliers Removed]
df3.groupby('neighbourhood_group').price.plot.hist(alpha = .5, bins = 100, legend = True, title = "Distribution of Boroughs Pricing loglog Scale", logx = True, logy = True)

plt.show()
[figure: overlaid histograms, Distribution of Boroughs Pricing loglog Scale]

Visualize Proportions#

I’ll limit the number of colors passed in to just 7 instead of the full palette of 10. The percentage that autopct receives has to be converted back to a count; I do that with a function. I do this in two ways below.

import pandas as pa
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

df = pa.read_csv('https://raw.githubusercontent.com/nurfnick/Data_Viz/main/Activity_Dataset_V1.csv')

def func(pct, allvals):
    absolute = int(pct/100.*np.sum(allvals))
    return "{:.0f}%\n{:d}".format(pct, absolute)


plt.pie(x=df.groupby('workout_type').workout_type.agg('count'),
        labels = df.groupby('workout_type').workout_type.agg('count').index, 
        autopct=lambda pct: func(pct,df.groupby('workout_type').workout_type.agg('count')),
        colors = sns.color_palette('bright')[:7])
plt.show()
[figure: pie chart of workout types with percents and counts]
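
The repeated groupby can be hoisted into a single value_counts call, for what it's worth; a sketch equivalent to the pie above:

counts = df.workout_type.value_counts().sort_index()  # same counts, same alphabetical order as the groupby

plt.pie(x = counts,
        labels = counts.index,
        autopct = lambda pct: func(pct, counts),
        colors = sns.color_palette('bright')[:len(counts)])
plt.show()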

If you just want the totals:

def func(pct, allvals):
    absolute = int(pct/100.*np.sum(allvals))
    return "{:d}".format(absolute)


plt.pie(x=df.groupby('workout_type').workout_type.agg('count'),
        labels = df.groupby('workout_type').workout_type.agg('count').index, 
        autopct=lambda pct: func(pct,df.groupby('workout_type').workout_type.agg('count')),
        colors = sns.color_palette('colorblind')[:8])
plt.show()
[figure: pie chart of workout types with counts only]