-
Create the database table using the function ta.Query().
ta.dropTable("text_input", schemaName = "public") ta.Query("create FACT TABLE public.text_input (doc_id int, text_data varchar(1000), category varchar(20)) distribute by hash(doc_id);") ta.Query("INSERT INTO public.text_input VALUES (1,'Tennis star Roger Federer was born on August 8, 1981, in Basel, Switzerland, to Swiss father Robert Federer and South African mother Lynette Du Rand','sports'), (2,'Federer took an interest in sports at an early age, playing tennis and soccer at the age of 8.','sports'),(3,'At age 14, Federer became the national junior champion in Switzerland.','sports'), (4,'Federer won the Wimbledon boys singles and doubles titles in 1998, and turned professional later that year.','sports'), (5,'In 2003, following a successful season on grass, Federer became the first Swiss man to win a Grand Slam title when he emerged victorious at Wimbledon.','sports'), (6,'A natural disaster is a major adverse event resulting from natural processes of the Earth. Examples include floods, volcanic eruptions, earthquakes, tsunamis, and other geologic processes.','natural disaster'),(7,'In a vulnerable area, however, such as San Francisco in 1906, an earthquake can have disastrous consequences and leave lasting damage, requiring years to repair.','natural disaster'), (8,'An earthquake is the result of a sudden release of energy in the Earth crust that creates seismic waves.','natural disaster'), (9,'Volcanoes can cause widespread destruction and consequent disaster in several ways.','natural disaster'),(10,'A flood is an overflow of water that submerges land.','natural disaster') ;")
-
Create a virtual data frame from the database table.
ta_data <- ta.data.frame("text_input", schemaName = "public")
-
Create function cleanText.
The function cleanText uses base R functions to remove numbers, punctuation, and non-ASCII characters, and converts all letters to lowercase
cleanText <- function(x) { x <- gsub(pattern='[[:punct:]]', replacement='', x, perl=T) x <- gsub(pattern='[[:digit:]]', replacement='', x, perl=T) x <- tolower(x) x <- iconv(x, to='ASCII', sub='') x }
-
Create the WordCount function.
WordCount <- function(df) { df[,1]<-as.character(df[,1]) x <- unlist(strsplit(df[,1], split='[ ]+')) v <- data.frame(w=as.character(x), c=rep(1, length(x))) aggregate(c~w, v, FUN=sum) }
-
Use aa.apply to run the cleanText function to clean the data and create the new database table "text_input_clean".
ta.dropTable("text_input_clean", schemaName = "my_schema") aa.apply(ta_data[,c('text_data','category')], MARGIN = 2, FUN = cleanText, out.format=list(type="tadf", columns=c("text","category"), table="text_input_clean", tableType="fact", partitionKey="category", schemaName="my_schema")) ta_data_clean <- ta.data.frame("text_input_clean", schemaName = "my_schema")
-
Use aa.apply to run the WordCount function on the cleaned data.
wc <- aa.apply(ta_data_clean, FUN = WordCount, out.format=list(columns=c("word","count")))
The first few rows of the output are:
word count 1 a 7 2 adverse 1 3 african 1 4 age 3 5 an 5 6 and 7 7 area 1 8 as 1 9 at 4 10 august 1 … …