Tasks related to basic NLP operations

Task 1

  1. write an SQL query that sums up reading for horror readers by day.
  2. how much did they read?
  3. how many readers are there?
  4. what country are the readers from?
In [1]:
!pip install psycopg2
Requirement already satisfied: psycopg2 in /opt/conda/lib/python3.6/site-packages
In [2]:
import pandas as pd
import psycopg2
from sqlalchemy import create_engine
In [3]:
# NOTE(review): credentials are embedded in the URL (user 'postgres', empty
# password); prefer reading them from the environment before sharing.
engine = create_engine('postgresql://postgres:@localhost/books')

# Parse the raw CSV exports.
reading = pd.read_csv('reading.csv')
stories = pd.read_csv('stories.csv')
visits = pd.read_csv('visits.csv')

# Write into DB. `if_exists='replace'` makes this cell idempotent: the default
# ('fail') raises ValueError on a second run because the tables already exist.
reading.to_sql('reading', engine, if_exists='replace')
stories.to_sql('stories', engine, if_exists='replace')
visits.to_sql('visits', engine, if_exists='replace')
In [4]:
# Task 1.1: reading for horror readers by day — distinct horror readers per day.
# Renamed slq_query_1 -> sql_query_1 for consistency with the other query cells.
sql_query_1 =\
    """SELECT readings_horror.days, COUNT(DISTINCT readings_horror.visitor_id)
    FROM (SELECT reading.visitor_id, to_char(reading.created_at,'YYYY-MM-DD') as days, reading.story_id, reading.visit_id
          FROM (SELECT id
                  FROM stories
                  WHERE stories.category_one = 'horror' or stories.category_two = 'horror') AS stories_horror
          INNER JOIN reading ON stories_horror.id = reading.story_id) AS readings_horror
    GROUP BY readings_horror.days
    ORDER BY readings_horror.days;"""

queried_data_1 = pd.read_sql(sql_query_1, engine)
In [5]:
# Task 1.2: how much did they read? — distinct days with reads, per visitor.
sql_query_2 =\
    """SELECT readings_horror.visitor_id, COUNT(DISTINCT readings_horror.days)
    FROM (SELECT reading.visitor_id, to_char(reading.created_at,'YYYY-MM-DD') as days, reading.story_id, reading.visit_id
          FROM (SELECT id
                  FROM stories
                  WHERE stories.category_one = 'horror' or stories.category_two = 'horror') AS stories_horror
          INNER JOIN reading ON stories_horror.id = reading.story_id) AS readings_horror
    GROUP BY readings_horror.visitor_id;"""

# Fixed NameError: the original read the undefined name `slq_query_2` here.
queried_data_2 = pd.read_sql(sql_query_2, engine)
In [6]:
# Task 1.3: how many readers are there? — count of distinct horror visitors.
sql_query_3 =\
    """SELECT COUNT(*)
    FROM (SELECT DISTINCT reading_books_horror.visitor_id
          FROM (SELECT readings_horror.created_at, readings_horror.story_id, readings_horror.visitor_id, visits.country
                FROM (SELECT reading.visitor_id, reading.created_at, reading.story_id, reading.visit_id
                      FROM (SELECT id
                            FROM stories
                            WHERE stories.category_one = 'horror' or stories.category_two = 'horror') AS stories_horror
                      INNER JOIN reading ON stories_horror.id = reading.story_id) AS readings_horror
                INNER JOIN visits ON readings_horror.visit_id=visits.visitor_id) AS reading_books_horror) AS visitors;"""
# NOTE(review): the join condition `readings_horror.visit_id = visits.visitor_id`
# matches a visit id against a visitor id — verify against the visits schema;
# it may be meant to join on visits' own id column.

# Fixed NameError: the original read the undefined name `slq_query_3` here.
queried_data_3 = pd.read_sql(sql_query_3, engine)
    
In [7]:
# Task 1.4: what country are the readers from? — distinct countries of horror readers.
sql_query_4 =\
    """SELECT DISTINCT reading_books_horror.country
    FROM (SELECT readings_horror.created_at, readings_horror.story_id, readings_horror.visitor_id, visits.country
          FROM (SELECT reading.visitor_id, reading.created_at, reading.story_id, reading.visit_id
                FROM (SELECT id
                      FROM stories
                      WHERE stories.category_one = 'horror' or stories.category_two = 'horror') AS stories_horror
                INNER JOIN reading ON stories_horror.id = reading.story_id) AS readings_horror
          INNER JOIN visits ON readings_horror.visit_id=visits.visitor_id) AS reading_books_horror;"""

# Fixed NameError: the original read the undefined name `slq_query_4` here.
queried_data_4 = pd.read_sql(sql_query_4, engine)

Task 2

write in Python a query that sums up reading for horror readers by day:

- how much did they read?
- how many readers are there?
- what country are the readers from?
In [8]:
import pandas as pd
In [9]:
# Parse files
reading = pd.read_csv('reading.csv')
stories = pd.read_csv('stories.csv')
visits = pd.read_csv('visits.csv')
In [10]:
# Build the horror-reading table: story ids whose category_one or category_two
# is 'horror', joined to reading events, then to visits for the country column.
stories_horror = stories.loc[((stories['category_one'] == 'horror') | (stories['category_two'] == 'horror')), ['id']]
reading_books = reading[['visitor_id', 'created_at', 'story_id', 'visit_id']].merge(stories_horror, left_on='story_id', right_on='id', how='inner', suffixes=('', '_y'))
# NOTE(review): this joins reading.visit_id against visits.visitor_id
# (mirroring the SQL in Task 1) — it looks like it may be meant to join on a
# visit id instead; confirm against the visits schema.
reading_books = reading_books.merge(visits[['visitor_id', 'country']], left_on='visit_id', right_on='visitor_id', how='inner', suffixes=('', '_y'))
# Normalize timestamps to plain dates for the per-day aggregations below.
reading_books['created_at'] = pd.to_datetime(reading_books['created_at'], format='%Y-%m-%d').apply(lambda x: x.date())
# Keep only the columns the analysis cells use.
reading_books = reading_books[['created_at', 'story_id', 'visitor_id', 'country']]
#reading_books
In [11]:
# readers by day: count of distinct horror readers per calendar date.
f_unique_readers = lambda x: len(x.unique())
readers_by_days = pd.pivot_table(reading_books, index='created_at', values='visitor_id', aggfunc=f_unique_readers).astype(int)
# `.as_matrix()` was removed in pandas 1.0; `.to_numpy()` is the supported API.
readers_by_days_d = dict(zip([x.strftime("%Y-%m-%d") for x in readers_by_days.index], readers_by_days.to_numpy().squeeze()))
In [12]:
# how much did they read? — number of distinct days with reads, per visitor.
visitors_days_read = pd.pivot_table(reading_books, index='visitor_id', values='created_at', aggfunc=lambda x: len(x.unique())).astype(int)
# `.as_matrix()` was removed in pandas 1.0; `.to_numpy()` is the supported API.
visitors_days_read_d = dict(zip(visitors_days_read.index, visitors_days_read.to_numpy().ravel()))
In [13]:
# how many readers are there? — number of distinct visitor ids.
reading_books['visitor_id'].drop_duplicates().size
Out[13]:
42
In [14]:
# what country are the readers from? — distinct countries, alphabetically.
sorted(reading_books['country'].drop_duplicates())
Out[14]:
['Australia',
 'Bangladesh',
 'Canada',
 'France',
 'Israel',
 'Philippines',
 'Romania',
 'United Kingdom',
 'United States']

Task 3

the Stories table contains a field called 'Teaser'. How would you extract geographic location from this?

In [15]:
import pandas as pd
import nltk
In [16]:
# Parse files
stories = pd.read_csv('stories.csv')

Retrieve Geographic location

In order to retrieve geographic locations we need stored information about the possible locations we want to consider. Lookups should run in O(1) time, and keys should be stored in a unique, normalized format.

In [17]:
# Example of data stored of geospatial information
geo_info = {'london': {'type': 'city', 'summary': {'city': 'London', 'country': 'United Kingdom'}}}
In [18]:
# Text normalization and tokenization
def normalize_text(text):
    """Lower-case `text` and split it into word tokens.

    Parameters
    ----------
    text : str
        Raw teaser text.

    Returns
    -------
    list of str
        Lower-cased tokens produced by ``nltk.word_tokenize``, suitable as
        lookup keys into the ``geo_info`` gazetteer.
    """
    return nltk.word_tokenize(text.lower())
In [19]:
## Retrieve geo information: one dict per story, merging the gazetteer entries
## of every token found in the teaser ({} when nothing matches or teaser is null).
geo_info_retrieved = []
for teaser in stories.teaser:
    geo_info_retrieved_e = {}
    # pd.notnull(teaser) tests the value directly; the original indexed a
    # boolean Series with a positional counter, which is only correct for a
    # default RangeIndex.
    if pd.notnull(teaser):
        for token in normalize_text(teaser):
            geo_info_retrieved_e.update(geo_info.get(token, {}))
    geo_info_retrieved.append(geo_info_retrieved_e)
In [20]:
geo_info_retrieved
Out[20]:
[{},
 {},
 {},
 {},
 {'summary': {'city': 'London', 'country': 'United Kingdom'}, 'type': 'city'},
 {}]