!pip install psycopg2
import pandas as pd
import psycopg2
from sqlalchemy import create_engine
engine = create_engine('postgresql://postgres:@localhost/books')
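# Optional: quick connection check (a sketch; SQLAlchemy 2.x requires text() for raw SQL)
from sqlalchemy import text
with engine.connect() as conn:
    conn.execute(text('SELECT 1'))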
# Parse files
reading = pd.read_csv('reading.csv')
stories = pd.read_csv('stories.csv')
visits = pd.read_csv('visits.csv')
# Write into DB
# if_exists='replace' makes re-runs idempotent; index=False skips the DataFrame index column
reading.to_sql('reading', engine, if_exists='replace', index=False)
stories.to_sql('stories', engine, if_exists='replace', index=False)
visits.to_sql('visits', engine, if_exists='replace', index=False)
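# Sanity check that the tables landed (a sketch; row counts depend on the CSVs)
print(pd.read_sql('SELECT COUNT(*) AS n FROM reading', engine))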
# reading for horror readers by day
sql_query_1 = \
"""SELECT readings_horror.days, COUNT(DISTINCT readings_horror.visitor_id) AS readers
FROM (SELECT reading.visitor_id, to_char(reading.created_at,'YYYY-MM-DD') as days, reading.story_id, reading.visit_id
FROM (SELECT id
FROM stories
WHERE stories.category_one = 'horror' or stories.category_two = 'horror') AS stories_horror
INNER JOIN reading ON stories_horror.id = reading.story_id) AS readings_horror
GROUP BY readings_horror.days
ORDER BY readings_horror.days;"""
queried_data_1 = pd.read_sql(sql_query_1, engine)
# how much did they read? (distinct reading days per visitor)
sql_query_2 = \
"""SELECT readings_horror.visitor_id, COUNT(DISTINCT readings_horror.days) AS days_read
FROM (SELECT reading.visitor_id, to_char(reading.created_at,'YYYY-MM-DD') as days, reading.story_id, reading.visit_id
FROM (SELECT id
FROM stories
WHERE stories.category_one = 'horror' or stories.category_two = 'horror') AS stories_horror
INNER JOIN reading ON stories_horror.id = reading.story_id) AS readings_horror
GROUP BY readings_horror.visitor_id;"""
queried_data_2 = pd.read_sql(sql_query_2, engine)
# how many readers are there?
sql_query_3 = \
"""SELECT COUNT(*) AS n_readers
FROM (SELECT DISTINCT reading_books_horror.visitor_id
FROM (SELECT readings_horror.created_at, readings_horror.story_id, readings_horror.visitor_id, visits.country
FROM (SELECT reading.visitor_id, reading.created_at, reading.story_id, reading.visit_id
FROM (SELECT id
FROM stories
WHERE stories.category_one = 'horror' or stories.category_two = 'horror') AS stories_horror
INNER JOIN reading ON stories_horror.id = reading.story_id) AS readings_horror
INNER JOIN visits ON readings_horror.visit_id = visits.visitor_id) AS reading_books_horror) AS visitors;"""
queried_data_3 = pd.read_sql(sql_query_3, engine)
# what country are the readers from?
sql_query_4 = \
"""SELECT DISTINCT reading_books_horror.country
FROM (SELECT readings_horror.created_at, readings_horror.story_id, readings_horror.visitor_id, visits.country
FROM (SELECT reading.visitor_id, reading.created_at, reading.story_id, reading.visit_id
FROM (SELECT id
FROM stories
WHERE stories.category_one = 'horror' or stories.category_two = 'horror') AS stories_horror
INNER JOIN reading ON stories_horror.id = reading.story_id) AS readings_horror
INNER JOIN visits ON readings_horror.visit_id = visits.visitor_id) AS reading_books_horror;"""
queried_data_4 = pd.read_sql(sql_query_4, engine)
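# The horror-readings subquery is rebuilt in all four queries above; a CTE can
# factor it out. A sketch, assuming the same schema as above:
sql_query_cte = \
"""WITH readings_horror AS (
    SELECT reading.visitor_id,
           to_char(reading.created_at, 'YYYY-MM-DD') AS days,
           reading.story_id,
           reading.visit_id
    FROM stories
    INNER JOIN reading ON stories.id = reading.story_id
    WHERE stories.category_one = 'horror' OR stories.category_two = 'horror'
)
SELECT days, COUNT(DISTINCT visitor_id) AS readers
FROM readings_horror
GROUP BY days
ORDER BY days;"""
queried_data_cte = pd.read_sql(sql_query_cte, engine)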
Write, in Python, a query that sums up reading for horror readers by day:
- how much did they read?
- how many readers are there?
- what country are the readers from?
import pandas as pd
# Parse files
reading = pd.read_csv('reading.csv')
stories = pd.read_csv('stories.csv')
visits = pd.read_csv('visits.csv')
# Filter horror stories, then join in the reading events and visit info
stories_horror = stories.loc[((stories['category_one'] == 'horror') | (stories['category_two'] == 'horror')), ['id']]
reading_books = reading[['visitor_id', 'created_at', 'story_id', 'visit_id']].merge(stories_horror, left_on='story_id', right_on='id', how='inner', suffixes=('', '_y'))
reading_books = reading_books.merge(visits[['visitor_id', 'country']], left_on='visit_id', right_on='visitor_id', how='inner', suffixes=('', '_y'))
reading_books['created_at'] = pd.to_datetime(reading_books['created_at'], format='%Y-%m-%d').dt.date
reading_books = reading_books[['created_at', 'story_id', 'visitor_id', 'country']]
# readers by day
f_unique_readers = lambda x: x.nunique()
readers_by_days = pd.pivot_table(reading_books, index='created_at', values='visitor_id', aggfunc=f_unique_readers).astype(int)
readers_by_days_d = dict(zip([x.strftime("%Y-%m-%d") for x in readers_by_days.index], readers_by_days.to_numpy().squeeze()))
# how much did they read?
visitors_days_read = pd.pivot_table(reading_books, index='visitor_id', values='created_at', aggfunc=f_unique_readers).astype(int)
visitors_days_read_d = dict(zip(visitors_days_read.index, visitors_days_read.to_numpy().ravel()))
# how many readers are there?
len(reading_books['visitor_id'].unique())
# what country are the readers from?
sorted(reading_books['country'].unique())
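# Equivalent, more idiomatic groupby versions of the pivots above (a sketch;
# the results should match the pivot_table outputs):
readers_by_days_alt = reading_books.groupby('created_at')['visitor_id'].nunique()
visitors_days_read_alt = reading_books.groupby('visitor_id')['created_at'].nunique()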
The Stories table contains a field called 'Teaser'. How would you extract geographic locations from it?
import pandas as pd
import nltk
# nltk.download('punkt')  # run once if the punkt tokenizer models are missing
# Parse files
stories = pd.read_csv('stories.csv')
To extract geographic locations we need a store of the candidate locations we want to recognize. Lookups should run in O(1) time, so the store is keyed on unique, normalized strings (lowercase, matching the tokenizer output below).
# Example of data stored of geospatial information
geo_info = {'london': {'type': 'city', 'summary': {'city': 'London', 'country': 'United Kingdom'}}}
# Text normalization and tokenization
import string

def normalize_text(text):
    """Lowercase the text, tokenize it, and drop punctuation tokens."""
    tokens = nltk.word_tokenize(text.lower())
    return [t for t in tokens if t not in string.punctuation]
# Retrieve geo information for each teaser
geo_info_retrieved = []
for teaser in stories.teaser:
    geo_info_retrieved_e = {}
    if pd.notna(teaser):
        for token in normalize_text(teaser):
            geo_info_retrieved_e.update(geo_info.get(token, {}))
    geo_info_retrieved.append(geo_info_retrieved_e)
geo_info_retrieved
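# Single-token lookup misses multi-word names such as 'new york'. A sketch
# extending the same dictionary approach to bigrams (the 'new york' entry is
# hypothetical, added only for illustration):
geo_info['new york'] = {'type': 'city', 'summary': {'city': 'New York', 'country': 'United States'}}

def extract_geo(tokens, store):
    # look up unigrams and adjacent-token bigrams against the geo store
    candidates = tokens + [' '.join(pair) for pair in zip(tokens, tokens[1:])]
    found = {}
    for c in candidates:
        found.update(store.get(c, {}))
    return found

extract_geo(normalize_text('A dark night in New York'), geo_info)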