Input Table
-- Sample input for the NER example: one text snippet per row, keyed by id.
-- The previous copy of the table is removed before it is recreated.
DROP TABLE ner_input_eng;

CREATE MULTISET TABLE ner_input_eng (
    id  INTEGER,
    txt VARCHAR(500) CHARACTER SET LATIN NOT CASESPECIFIC
);

-- Rows 1-5 are English sentences; embedded apostrophes are escaped by doubling ('').
INSERT INTO ner_input_eng (id, txt)
VALUES (1, 'At end of August, the Janus Unconstrained fund held only 45 debt issues with 70 percent of its assets in U.S. government debt.');

INSERT INTO ner_input_eng (id, txt)
VALUES (2, 'One Treasury issue due June 2016 alone was worth 43 percent of the fund''s total assets.');

INSERT INTO ner_input_eng (id, txt)
VALUES (3, 'Most of the bonds have short durations, with the average maturity of just over three years, indicating a generally defensive posture.');

INSERT INTO ner_input_eng (id, txt)
VALUES (4, 'For Bill Gross, quitting Pimco''s $222 billion Total Return Fund to take over a $13 million fund at Janus Capital is like resigning the U.S. presidency to become city manager of Ashtabula, Ohio, population 18,800.');

INSERT INTO ner_input_eng (id, txt)
VALUES (5, 'Gross stunned the investing world on Friday with his abrupt departure from Pimco, the $2 trillion asset manager he co-founded in 1971 and where he had run the Total Return Fund, the world''s biggest bond fund, for more than 27 years.');

-- Row 6 is the literal text [0-9]+ (regex-looking characters stored as plain data).
INSERT INTO ner_input_eng (id, txt)
VALUES (6, '[0-9]+');
Rules Table
-- Rules table: each row pairs an entity type label (type_ner) with a
-- regular-expression pattern (regex).
-- The previous copy of the table is removed before it is recreated.
DROP TABLE ner_rule;

CREATE MULTISET TABLE ner_rule (
    type_ner VARCHAR(500) CHARACTER SET LATIN NOT CASESPECIFIC,
    regex    VARCHAR(500) CHARACTER SET LATIN NOT CASESPECIFIC
);

INSERT INTO ner_rule (type_ner, regex)
VALUES ('email', '[\w\-]([\.\w])+[\w]+@([\w\-]+\.)+[a-zA-Z]{2,4}');

-- The Money and Digits patterns require whitespace (\s) on both sides of the number.
INSERT INTO ner_rule (type_ner, regex)
VALUES ('Money', '\s\$[0-9]+\s');

INSERT INTO ner_rule (type_ner, regex)
VALUES ('Digits', '\s[0-9]+\s');

-- Name: two capitalised words separated by whitespace.
INSERT INTO ner_rule (type_ner, regex)
VALUES ('Name', '[A-Z][a-z]+\s+[A-Z][a-z]+');
Dict Table
-- Dictionary table: each row pairs an entity type label (type_ner) with a
-- dictionary entry (dict).
-- The previous copy of the table is removed before it is recreated.
DROP TABLE ner_dict;

CREATE MULTISET TABLE ner_dict (
    type_ner VARCHAR(500) CHARACTER SET LATIN NOT CASESPECIFIC,
    dict     VARCHAR(500) CHARACTER SET LATIN NOT CASESPECIFIC
);

INSERT INTO ner_dict (type_ner, dict)
VALUES ('location', 'Arkansas');

INSERT INTO ner_dict (type_ner, dict)
VALUES ('location', 'Dublin');

-- Several entries below carry leading and/or trailing spaces inside the quotes;
-- those spaces are part of the stored value and are kept exactly as written.
INSERT INTO ner_dict (type_ner, dict)
VALUES ('MISC', ' average maturity');

INSERT INTO ner_dict (type_ner, dict)
VALUES ('location', 'Ohio ');

INSERT INTO ner_dict (type_ner, dict)
VALUES ('month', ' June ');

INSERT INTO ner_dict (type_ner, dict)
VALUES ('Last Name', ' Gross');

-- Same literal text as input row 6 of ner_input_eng.
INSERT INTO ner_dict (type_ner, dict)
VALUES ('digit regex', '[0-9]+');
Query Statement
Input Table: ner_input_eng
Rules Table: ner_rule
Dict Table: ner_dict
-- Run TD_NERExtractor over ner_input_eng, supplying ner_rule and ner_dict as
-- DIMENSION inputs, and list the extracted entities per input row.
SELECT
    id,
    entity,
    "type",      -- "type", "start" and "end" are quoted as in the original;
    "start",     -- presumably they collide with reserved words (confirm
    "end",       -- against the Teradata reserved-word list)
    context,
    approach
FROM TD_NERExtractor (
    ON ner_input_eng AS InputTable
    ON ner_rule AS rules DIMENSION
    ON ner_dict AS dict DIMENSION
    USING
        TextColumn ('txt')
        InputLanguage ('en')
        ShowContext (3)
        Accumulate ('id')
) AS dt
ORDER BY
    id,
    "start";
Output:
id | entity | type | start | end | context | approach |
---|---|---|---|---|---|---|
1 | Janus Unconstrained | Name | 6 | 7 | of August, the Janus Unconstrained fund held only | RULE |
1 | 45 | Digits | 11 | 11 | fund held only 45 debt issues with | RULE |
1 | 70 | Digits | 15 | 15 | debt issues with 70 percent of its | RULE |
2 | One Treasury | Name | 1 | 2 | ... ... ... One Treasury issue due June | RULE |
2 | June | month | 5 | 5 | Treasury issue due June 2016 alone was | DICT |
2 | 2016 | Digits | 6 | 6 | issue due June 2016 alone was worth | RULE |
2 | 43 | Digits | 10 | 10 | alone was worth 43 percent of the | RULE |
3 | average maturity | MISC | 10 | 11 | durations, with the average maturity of just over | DICT |
4 | For Bill | Name | 1 | 2 | ... ... ... For Bill Gross, quitting Pimco's | RULE |
4 | Gross | Last Name | 3 | 3 | ... For Bill Gross, quitting Pimco's $222 | DICT |
4 | $222 | Money | 6 | 6 | Gross, quitting Pimco's $222 billion Total Return | RULE |
4 | Total Return | Name | 8 | 9 | Pimco's $222 billion Total Return Fund to take | RULE |
4 | $13 | Money | 15 | 15 | take over a $13 million fund at | RULE |
4 | Janus Capital | Name | 19 | 20 | million fund at Janus Capital is like resigning | RULE |
4 | Ohio | location | 33 | 33 | manager of Ashtabula, Ohio, population 18,800. ... | DICT |
5 | Gross | Last Name | 1 | 1 | ... ... ... Gross stunned the investing | DICT |
5 | $2 | Money | 15 | 15 | from Pimco, the $2 trillion asset manager | RULE |
5 | 1971 | Digits | 22 | 22 | he co-founded in 1971 and where he | RULE |
5 | Total Return | Name | 29 | 30 | had run the Total Return Fund, the world's | RULE |
5 | 27 | Digits | 40 | 40 | for more than 27 years. ... ... | RULE |
6 | [0-9]+ | digit regex | 1 | 1 | ... ... ... [0-9]+ ... ... ... | DICT |