Skip to content

December 14, 2012

This blog moved to: http://databasr.blogspot.com/

Mining Twitter for consumer attitudes towards hotels

March 9, 2012

Couple of months back I read Jeffrey Breen’s presentation on mining Twitter for consumer attitudes towards airlines, so I was just curious how it would look if I estimate the sentiment toward major hotels.

So here it is:

# load twitter library
> library(twitteR)
# search for all the hilton tweets
> hilton.tweets=searchTwitter('@hilton',n=1500)
> length(hilton.tweets)
[1] 162
> class(hilton.tweets)
[1] "list"
> tweet=hilton.tweets[[1]]
> class(tweet)
[1] "status"
attr(,"package")
[1] "twitteR"
> tweet$getScreenName()
[1] "i_RAHOFA"
> tweet$getText()
> library("plyr")
> hilton.text=laply(hilton.tweets,function(t)t$getText())
> length(hilton.text)
> head(hilton.text,5)
# load list of positive and negative words for SIMPLE sentiment analysis
# you would have to download the files from a website I included below - make sure you put in the directory that you will be
# referencing
> hu.liu.pos=scan('/Users/marcinkulakowski/Downloads/r/positive-words.txt',what='character',comment.char=';')
> hu.liu.neg=scan('/Users/marcinkulakowski/Downloads/r/negative-words.txt',what='character',comment.char=';')
> pos.words=c(hu.liu.pos,'upgrade')
> neg.words=c(hu.liu.neg,'wtf','wait','waiting','epicfail','mechanical')
# sampling
> sample=c("You're awesome and I love you","I hate and hate and hate. So angry. Die!","Impressed and amazed: you are peerless in your achievement of unparalleled mediocrity.")
score.sentiment <- function(sentences, pos.words, neg.words, .progress='none')
{
  # Score the sentiment of each sentence as:
  #   (# words found in pos.words) - (# words found in neg.words)
  #
  # Args:
  #   sentences: character vector of texts to score
  #   pos.words: character vector of positive sentiment terms
  #   neg.words: character vector of negative sentiment terms
  #   .progress: 'text' shows a console progress bar; any other value is
  #              silent. Kept for backward compatibility with the original
  #              plyr-based signature.
  #
  # Returns:
  #   data.frame with one row per sentence: integer column `score` and
  #   character column `text` (the original, uncleaned sentence).
  #
  # Note: uses only base R (strsplit/match/txtProgressBar) instead of the
  # plyr/stringr packages the original required for laply() and str_split().

  # Score a single sentence.
  score_one <- function(sentence) {
    # clean up with R's regex-driven global substitute, gsub():
    # strip punctuation, control characters and digits, then lower-case
    sentence <- gsub('[[:punct:]]', '', sentence)
    sentence <- gsub('[[:cntrl:]]', '', sentence)
    sentence <- gsub('\\d+', '', sentence)
    sentence <- tolower(sentence)

    # split into words on runs of whitespace
    words <- unlist(strsplit(sentence, '\\s+'))

    # match() returns the position of the matched term or NA;
    # !is.na() turns that into TRUE/FALSE, and sum() treats TRUE as 1
    sum(!is.na(match(words, pos.words))) - sum(!is.na(match(words, neg.words)))
  }

  n <- length(sentences)
  scores <- integer(n)

  # honor the .progress='text' contract of the plyr original
  pb <- if (identical(.progress, 'text') && n > 0) {
    txtProgressBar(min = 0, max = n, style = 3)
  } else {
    NULL
  }

  for (i in seq_len(n)) {
    scores[i] <- score_one(sentences[i])
    if (!is.null(pb)) setTxtProgressBar(pb, i)
  }
  if (!is.null(pb)) close(pb)

  # stringsAsFactors = FALSE pins `text` to character on every R version
  # (pre-4.0 R would otherwise silently return a factor column)
  scores.df <- data.frame(score = scores, text = sentences,
                          stringsAsFactors = FALSE)
  return(scores.df)
}
> result=score.sentiment(sample,pos.words,neg.words)
> class(result)
[1] "data.frame"
> result$score
[1] 0 0 0
> hilton.scores=score.sentiment(hilton.text,pos.words,neg.words,.progress='text')
> hilton.scores$hotel='Hilton'
> hilton.scores$code='HL'
> hist(hilton.scores$score)
# hilton histogram
> library("ggplot2")
> qplot(hilton.scores$score)
# qplot hilton
# lets search for all other major hotels
# Intercontinental
intercontinental.tweets=searchTwitter('@intercontinental',n=1500)
class(tweet)
intercontinental.text=laply(intercontinental.tweets,function(t)t$getText())
intercontinental.scores=score.sentiment(intercontinental.text,pos.words,neg.words,.progress='text')
intercontinental.scores$hotel='Intercontinental'
intercontinental.scores$code='IC'
# Wyndham
wyndham.tweets=searchTwitter('@wyndham',n=1500)
class(tweet)
wyndham.text=laply(wyndham.tweets,function(t)t$getText())
wyndham.scores=score.sentiment(wyndham.text,pos.words,neg.words,.progress='text')
wyndham.scores$hotel='Wyndham'
wyndham.scores$code='WY'
# Marriott
marriott.tweets=searchTwitter('@marriott',n=1500)
class(tweet)
marriott.text=laply(marriott.tweets,function(t)t$getText())
marriott.scores=score.sentiment(marriott.text,pos.words,neg.words,.progress='text')
marriott.scores$hotel='Marriott'
marriott.scores$code='MI'
# BestWestern
bestwestern.tweets=searchTwitter('@bestwestern',n=1500)
class(tweet)
bestwestern.text=laply(bestwestern.tweets,function(t)t$getText())
bestwestern.scores=score.sentiment(bestwestern.text,pos.words,neg.words,.progress='text')
bestwestern.scores$hotel='Bestwestern'
bestwestern.scores$code='BW'
# Starwood
starwood.tweets=searchTwitter('@starwood',n=1500)
class(tweet)
starwood.text=laply(starwood.tweets,function(t)t$getText())
starwood.scores=score.sentiment(starwood.text,pos.words,neg.words,.progress='text')
starwood.scores$hotel='Starwood'
starwood.scores$code='SW'
# Hyatt
hyatt.tweets=searchTwitter('@hyatt',n=1500)
class(tweet)
hyatt.text=laply(hyatt.tweets,function(t)t$getText())
hyatt.scores=score.sentiment(hyatt.text,pos.words,neg.words,.progress='text')
hyatt.scores$hotel='Hyatt'
hyatt.scores$code='HY'
> all.scores=rbind(intercontinental.scores,wyndham.scores,hilton.scores,marriott.scores,bestwestern.scores,starwood.scores,hyatt.scores)
# Make separate plot for each hotel
> ggplot(data=all.scores)+ # ggplot works on data.frames, always
geom_bar(mapping=aes(x=score,fill=hotel),binwidth=1)+
facet_grid(hotel~.)+ # make a separate plot for each hotel
theme_bw()+scale_fill_brewer() # plain display, nicer colors
# Plot
> all.scores$very.pos=as.numeric(all.scores$score>=2)
> all.scores$very.neg=as.numeric(all.scores$score<=-2)
> twitter.df=ddply(all.scores,c('hotel','code'),summarise,pos.count=sum(very.pos),neg.count=sum(very.neg))
> twitter.df$all.count=twitter.df$pos.count+twitter.df$neg.count
> twitter.df$score=round(100*twitter.df$pos.count/twitter.df$all.count)
> install.packages("doBy")
> library("doBy")
> orderBy(~-score,twitter.df)
hotel code pos.count neg.count all.count score
1 Bestwestern BW 6 0 6 100
5 Starwood SW 7 0 7 100
6 Wyndham WY 2 0 2 100
3 Hyatt HY 7 1 8 88
2 Hilton HL 15 3 18 83
4 Marriott MI 13 4 17 76
> install.packages("XML")
> library(XML)
> acsi.url='http://www.theacsi.org/index.php?option=com_content&view=article&id=147&catid=&Itemid=212&i=Hotels'
# scrape acsi website for scores
> acsi.df=readHTMLTable(acsi.url,header=T,which=1,stringsAsFactors=F)
> acsi.df=acsi.df[,c(1,18)]
> head(acsi.df,1)
> colnames(acsi.df)=c('hotel','score')
> acsi.df$score=as.numeric(acsi.df$score)
> View(acsi.df)
# ACSI Dataframe
> acsi.df$code=c('HL','SW','MI','NA','HY','NA','IC','BW','NA','WY','NA','NA','NA')
> acsi.df$score=as.numeric(acsi.df$score)
> compare.df=merge(twitter.df,acsi.df,by='code',suffixes=c('.twitter','.acsi'))
> compare.df=subset(compare.df,all.count>100)
> compare.df=merge(twitter.df,acsi.df,by='code',suffixes=c('.twitter','.acsi'))
> View(compare.df)
# scores compared
> ggplot(compare.df)+geom_point(aes(x=score.twitter,y=score.acsi,color=hotel.twitter),size=6)+ geom_smooth(aes(x=score.twitter,y=score.acsi,group=1),se=F,method="lm")+theme_bw()+opts(legend.position=c(0.85,0.85))
# final plot

Materials used:
——————————————
Jeffrey Breen’s presentation

The American Customer Satisfaction Index (Hotels)

Opinion Mining, Sentiment Analysis, Opinion Extraction

56 years of FORTUNE’s 500 list of America’s largest corporations

January 17, 2012

The list was scraped from the web using R

Fortune500

Enjoy!

Machine Readable Data

November 17, 2011

How do we analyze news articles, blogs and tweets into actionable information that can be used in a matter of seconds?
        I have thought about this and I read couple of papers, solutions and I came up with this Logical Data Model that definitely can be used to convert news into machine readable data, assign numbers, sentiments about the specific topic related to news story and assign a score which would give any company or analyst quantifiable values that can make better and instant decisions.
        The design is robust so that we can have a list of words in any language, news stories of any type on any topic.

Here is how I would approach the analysis:
1.) Load the static tables such as WORD, TOPIC, NEWS TYPE, LANGUAGE, which are optional in the physical implementation world and WORD TOPIC which is very important for our analysis.
2.) Search Twitter, Newspapers etc. for text, stories, and blogs and load them up to NEWS STORY
3.) Sort the news by the Topic that we are trying to do analysis on.
4.) Create a scorecard to rank these stories in NEWS STORY TOPIC by looking up the WORDS’s sentiment in the WORD TOPIC
5.) Summarize for each topic
6.) Compare the story sentiment with some satisfaction or competition out there, or even measure that against product prices etc.
7.) Pinpoint the problems and draw conclusions

        The Natural Language Process would parse and categorize the text which produces metadata with any number of factors such as whether the story is important to this company or how relevant this is to this company etc.

        Now, I know it’s easier said than it’s done, but that’s how at least I would approach this process.
If anyone is interested in obtaining the DDL, just let me know and I will send it to you.

Make Everything Generic and Flexible?

October 27, 2011

Here are just two simple ways to make your database design more generic and flexible, which is not always good but in certain cases it makes sense and it pays to do in the long term.

First, is the preferences data, e.g. settings to your website, product, system, device etc. It should take care of any type of preferences/setting data and I have added some use cases based on iPhone device.

Preferences Data Model

Use case:

USER_ID: '90392847'
PRODUCT_ID: 'IPHONE'
CATEGORY_NAME: 'SETTINGS'
PREFERENCE_NAME: 'GENERAL'
DETAIL_NAME: 'NETWORK'
ATTRIBUTE_NAME: 'ENABLE 3G'         VALUE_TEXT: [Field name]
ATTRIBUTE_NAME: 'VALUE'             VALUE_TEXT: 'Y'
ATTRIBUTE_NAME: 'ATTRIBUTE STATE'   VALUE_TEXT: 'REQUIRED'
ATTRIBUTE_NAME: 'OPERATOR'         VALUE_TEXT: 'EQUALS'

USER_ID: '90392847'
PRODUCT_ID: 'IPHONE'
CATEGORY_NAME: 'SETTINGS'
PREFERENCE_NAME: 'GENERAL'
DETAIL_NAME: 'NETWORK'
ATTRIBUTE_NAME: 'CELLULAR DATA'      VALUE_TEXT: [Field name]
ATTRIBUTE_NAME: 'VALUE'             VALUE_TEXT: 'Y'
ATTRIBUTE_NAME: 'ATTRIBUTE STATE'    VALUE_TEXT: 'REQUIRED'
ATTRIBUTE_NAME: 'OPERATOR'         VALUE_TEXT: 'EQUALS'

Another useful design is whenever you have tons of reference type data such as list of country codes, state codes, status codes etc. it is good to keep them on the same table. Now, there are advantages and disadvantages to that which I am describing below.

Reference Data Model

Code & Reference Tables
——————————————————————
Just because it is an entity on the logical data model…
- Does not always have to be a separate physical table
- Nor does it always require a physical RI relationship

“Not coded” information – store in base table anyway, forget code tables (Physical Database Implementation)
- Sometimes used for short-lived data as well as permanent data
- Where code description misspellings cannot occur

Code & Reference Tables – One option
——————————————————————
Combine code and reference tables into common groups
- Segmented tablespaces by characteristics
- Code “tables of tables” (See the diagram)
+ Do not cluster on primary index (spread data around)
+ Put index in separate buffer pool

Bottom Line
- Reduce number of objects in memory
- Use segmented tablespace based on like characteristics
- Use tables of tables where applicable
- Less tablespace and indexes to manage
- Less backup and recovery issues
- Less Maintenance issues

Code & Reference Tables – Placement/Access
——————————————————————
Load small code tables into memory tables (application)
- Faster than buffer pool for null values needed to be displayed
- Buffer pool access at 0.1ms to 0.4ms
- I/O to fill the buffer pools at 10ms+ for I/O index and data

Simple Heatmap in R with Formula One Dataset

October 25, 2011

Now, that the 2011 F1 season is over I decided to quickly scrub the Formula 1 data of the F1.com website, such as the list of drivers, ordered by the approximate amount of salary driver is getting (top list driver is making the most, approx. 30MM) and position at the end of each race. There was a little bit of work coming up with this small dataset but I wanted to produce a heatmap type of graph to show the distinction between the drivers with respect to their salaries, plus its just couple of simple steps in R. One thing about this heatmap to notice is how consistent the driver is and who will move up the chain based on this season’s performance.

1.) You need R

2.) Dataset. I uploaded mine to DataCouch.com

3.) > library("ggplot2")

ggplot is an implementation of the grammar of graphics in R

4.) > F1_POS <- read.csv("/Users/marcinkulakowski/R/F1_POS.csv")

Load the data into F1_POS dataframe

5.) >F1_POS$Driver <- with(F1_POS, reorder(Driver, Salary))

The drivers are ordered by points, and the Salary variable converted to a factor for sorting.

6.) >F1_POS.m <- melt(F1_POS)

>F1_POS.m <- ddply(F1_POS.m, .(variable), transform, rescale = rescale(value))

Convert the data for easy casting and rescale the stats.

7.) > (p <- ggplot(F1_POS.m, aes(variable, Driver)) + geom_tile(aes(fill = rescale), colour = "white") + scale_fill_gradient(low = "red", high = "yellow"))

Plot the data.

8.) >base_size <- 9

      > p + theme_grey(base_size = base_size) + labs(x = "", y = "") + scale_x_discrete(expand = c(0, 0)) + scale_y_discrete(expand = c(0, 0)) + opts(legend.position = "none", axis.ticks = theme_blank(), axis.text.x = theme_text(size = base_size * 0.8, angle = 330, hjust = 0, colour = "grey50"))

Source: Formula One

PDF Version

Follow

Get every new post delivered to your Inbox.