Data Gathering
Data Gathering
CSV
Opening a local .csv file
import pandas as pd
df = pd.read_csv('file_name.csv')
import requests
from io import StringIO
url = ""
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:66.0) "
                         "Gecko/20100101 Firefox/66.0"}
req = requests.get(url, headers=headers)
data = StringIO(req.text)
pd.read_csv(data)
The `sep` parameter
The `skiprows` / `nrows` parameters
df = pd.read_csv('file_name.csv', skiprows=[1, 5])
df = pd.read_csv('file_name.csv', nrows=100)  # nrows = number of rows to read, e.g. 100 or 200
The `encoding` parameter
df = pd.read_csv('file_name.tsv', encoding='utf-8')  # e.g. 'utf-8', 'latin-1'
Skipping bad lines
df = pd.read_csv('file_name.csv', on_bad_lines='skip')  # error_bad_lines=False was removed in pandas 2.0
def fun_name(var_name):
    """Return "new value" when *var_name* equals "value"; otherwise return it unchanged.

    Note: the original notes compared an undefined name (`name`) instead of
    the parameter; this version uses the parameter itself.
    """
    if var_name == "value":
        return "new value"
    return var_name
SQL
Loading a .sql dump file into MySQL
import pandas as pd
import requests
response = requests.get(api_link)
response.json()['results']  # 'results' (or whatever the API names it) is the key holding the records
pd.DataFrame(response.json()['results'])
Extracting specific columns from the many columns
pd.DataFrame(response.json()['results'])[['id', 'title', '---', '---']]
df = pd.DataFrame(response.json()['results'])[['id', 'title', '---', '---']]
df = pd.DataFrame()
for i in range(1, total_number_of_pages):
    temp_df = pd.DataFrame(response.json()['results'])[['id', 'title', '---', '---']]
    df = pd.concat([df, temp_df], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
df.to_csv('file_name.csv')
RapidAPI CSV file
Web Scraping
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Scrape one page of AmbitionBox company listings into a DataFrame.
webpage = requests.get('https://www.ambitionbox.com/list-of-companies?page=1').text
soup = BeautifulSoup(webpage, 'lxml')
# print(soup.prettify())

# NOTE(review): the original notes never defined `company`; it presumably
# comes from a find_all over the per-company cards -- verify this selector
# against the live page markup.
company = soup.find_all('div', class_='company-content-wrapper')


def _text_or_nan(extract):
    """Call `extract()` and return its stripped text, or np.nan when the
    element is missing (None / index out of range)."""
    try:
        return extract().text.strip()
    except (AttributeError, IndexError):
        return np.nan


# Result columns (the original notes used these lists without defining them).
name, rating, reviews = [], [], []
ctype, hq, how_old, no_of_employee = [], [], [], []

for card in company:
    info = card.find_all('p', class_='infoEntity')  # hoisted: used four times below
    name.append(_text_or_nan(lambda: card.find('h2')))
    rating.append(_text_or_nan(lambda: card.find('p', class_='rating')))
    reviews.append(_text_or_nan(lambda: card.find('a', class_='review-count')))
    ctype.append(_text_or_nan(lambda: info[0]))
    hq.append(_text_or_nan(lambda: info[1]))
    how_old.append(_text_or_nan(lambda: info[2]))
    no_of_employee.append(_text_or_nan(lambda: info[3]))

df = pd.DataFrame({
    'name': name,
    'rating': rating,
    'reviews': reviews,
    'company_type': ctype,
    'Head_Quarters': hq,
    'Company_Age': how_old,
    'No_of_Employee': no_of_employee,
})

# Accumulate pages with pd.concat -- DataFrame.append was removed in pandas 2.0.
# NOTE(review): `final` was never initialized in the original notes.
final = pd.DataFrame()
final = pd.concat([final, df], ignore_index=True)