SlideShare a Scribd company logo
CL-NLP –
A Natural Language Processing
Library for Common Lisp
Vsevolod Dyomkin
Topics
* Motivation
* Library high-level design
* Overview of the functions
* Some technical details
* Next steps
Motivation
NLP & Lisp is a match, made
in heaven
Previous Work
* CL-Langutils
* Lexiparse, Sparser, CL-EARLY-
PARSER, Basic-English-Grammar
* Wordnet interfaces
* CMU AI Repository
* Natural Language Understanding
* Natural Language Processing in
Lisp
Essential Scope
* Corpora interfaces
* Language modeling
* Measures
* Tokenization
* Tagging
* Parsing
* Wordnet interface
* Construct a pipeline
TDD, BDD, CDD
An NLP Pipeline
“This is a test.” ->
TOP
|
S
/~~~~~~~~~~|~~~~~~~~
NP VP .
| /~~~~~~ |
DT VBZ NP .
| | /~~~~
This is DT NN
| |
a test
An NLP Pipeline
(print-parse-trees "This is a test.")
(defun print-parse-trees (text)
(dolist (sentence
(tokenize <sentence-tokenizer> text))
(pprint-tree (parse *pcfg*
(tag *hmm* text)))))
(defparameter *hmm*
(train 'hmm-tagger "treebank_dir/"))
(defparameter *pcfg*
(train 'pcfg "treebank_dir/"))
Some Design Choices
* A pseudo-hierarchical
package design
High-level Design
* cl-nlp & cl-nlp-contrib
& more
* base & functional packages
* nlp-user
Modules
* nlp.core, nlp.corpora,
nlp.util, nlp.test-util
* nlp.syntax, nlp.generation,
nlp.phonetics
* nlp.contrib.wordnet,
nlp.contrib.ms-ngrams
Some Design Choices
* Use the whole language
- defclass & defstruct
- hash-tables & alists
- do & loop
Some Design Choices
* Grow the language
- rutils
+ reader macros
+ shorthands
+ dotable, dolines
- special-purpose utils
General-Purpose Utils
(json:encode-json #{
"success" t
"sentence" text
"match"
(when-it (2nd (funcall fn (get-model text)))
(strcat
"{"
(strjoin ","
(mapcar #`(fmt ""~A":[~A,~A]"
(car %) (cadr %)
(cddr %))
(ht->alist it)))
"}"))
} stream)
A Special-Purpose Util
(define-lazy-singleton word-tokenizer
(make 'postprocessing-regex-word-tokenizer)
"Default word tokenizer.")
(defun tokenize-ngram (ngrams str)
"Transform string STR to a list if necessary
(depending of order of NGRAMS)."
(if (> (ngrams-order ngrams) 1)
(tokenize <word-tokenizer> str)
str))
A Special-Purpose Util
(defmacro define-lazy-singleton
(name init &optional docstring)
(with-gensyms (singleton)
`(let (,singleton)
(defun ,name ()
,docstring
(or ,singleton
(setf ,singleton ,init)))
(define-symbol-macro
,(mksym name :format "<~A>")
(,name)))))
Some Design Choices
* Use CLOS as a foundation
Basic Cell
(defclass regex-word-tokenizer (tokenizer)
((regex :accessor tokenizer-regex
:initarg :regex
:initform
(re:create-scanner
"w+|[!"#$%&'*+,./:;<=>?@^`~…()
{}[|] «»“”‘’¶-]"⟨⟩ ‒–—― )
:documentation
"A simpler variant would be [^s]+ —
it doesn't split punctuation, yet
sometimes it's desirable."))
(:documentation
"Regex-based word tokenizer."))
Basic Cell
(defmethod tokenize
((tokenizer regex-word-tokenizer) string)
(loop
:for (beg end)
:on (re:all-matches (tokenizer-regex
tokenizer)
string)
:by #'cddr
:collect (sub string beg end) :into words
:collect (cons beg end) :into spans
:finally (return (values words
spans)))
Another Example
(defgeneric parse (model sentence)
(:documentation
"Parse SENTENCE with MODEL.")
(:method :around (model (sentence string))
(call-next-method
model (tokenize <word-tokenizer> string))))
(defgeneric parse-n (model sentence n)
(:documentation
"Return N best parse trees of the SENTENCE
with MODEL.")
(:method :around (model (sentence string) n)
(call-next-method
model (tokenize <word-tokenizer> string) n)))
Parsing
(defmethod parse ((grammar pcfg) (sentence list))
(CKY (let* ((cur (cons rule (1- s)))
(l (@ pi0 (1- i) (1- s)
(second rule)))
(r (@ pi0 s (1- j) (third rule)))
(score (if (and l r)
(+ (log q) l r)
min)))
(when (> score (or max min))
(setf max score
arg cur)))))
Parsing
(defmethod parse :around ((grammar pcfg)
(sentence list))
(with-raw-results
(values (idx->nts (decode-parse-tree
sentence bps 0 last iroot))
(exp (or (@ pi0 0 last iroot) min))
pi0
bps)))
(macrolet
((CKY (&body body)
`(with-slots (rules nts->idx) grammar
(let* ((pi0 #{}) (bps #{})
(min most-negative-single-float))
;; init pi0 & bps (skipped)
(do ((pos 1 (1+ pos)))
((>= pos *sentence-length*))
(do ((i 1 (1+ i)))
((> i (- *sentence-length* pos)))
(let ((j (+ i pos)))
(dotable (_ k nts->idx)
(let (max arg)
(do ((s i (1+ s)))
((>= s j))
(dotable (rule q rules)
(when (and (tryadic rule)
(= k (first rule)))
,@body)))
(when (if (listp max) max (> max min))
(setf (@ pi0 (1- i) (1- j) k) max
(@ bps (1- i) (1- j) k) arg)))))))
(values pi0 bps)))))
(declaim (inline @))
(defun @ (m i j k)
(get# (+ (* i *sentence-length* *nt-count*)
(* j *nt-count*)
k)
m))
(defsetf @ (m i j k) (v)
`(set# (+ (* ,i *sentence-length* *nt-count*)
(* ,j *nt-count*)
,k)
,m ,v))
http://lisp-univ-etc.blogspot.com/search/label/nltk
Ad

More Related Content

What's hot (20)

Getting groovy (ODP)
Getting groovy (ODP)Getting groovy (ODP)
Getting groovy (ODP)
Nick Dixon
 
The Rust Borrow Checker
The Rust Borrow CheckerThe Rust Borrow Checker
The Rust Borrow Checker
Nell Shamrell-Harrington
 
Austin Bingham. Python Refactoring. PyCon Belarus
Austin Bingham. Python Refactoring. PyCon BelarusAustin Bingham. Python Refactoring. PyCon Belarus
Austin Bingham. Python Refactoring. PyCon Belarus
Alina Dolgikh
 
On UnQLite
On UnQLiteOn UnQLite
On UnQLite
charsbar
 
Rihards Olups - Encrypting Daemon Traffic With Zabbix 3.0
Rihards Olups - Encrypting Daemon Traffic With Zabbix 3.0Rihards Olups - Encrypting Daemon Traffic With Zabbix 3.0
Rihards Olups - Encrypting Daemon Traffic With Zabbix 3.0
Zabbix
 
Modern javascript localization with c-3po and the good old gettext
Modern javascript localization with c-3po and the good old gettextModern javascript localization with c-3po and the good old gettext
Modern javascript localization with c-3po and the good old gettext
Alexander Mostovenko
 
Os Welton
Os WeltonOs Welton
Os Welton
oscon2007
 
Introduction to source{d} Engine and source{d} Lookout
Introduction to source{d} Engine and source{d} Lookout Introduction to source{d} Engine and source{d} Lookout
Introduction to source{d} Engine and source{d} Lookout
source{d}
 
Writing and using php streams and sockets
Writing and using php streams and socketsWriting and using php streams and sockets
Writing and using php streams and sockets
Elizabeth Smith
 
"Развитие ветки PHP-7"
"Развитие ветки PHP-7""Развитие ветки PHP-7"
"Развитие ветки PHP-7"
Badoo Development
 
Perl 5 & 6 regex complexity
Perl 5 & 6 regex complexityPerl 5 & 6 regex complexity
Perl 5 & 6 regex complexity
Radoslaw Kotowicz
 
The Ring programming language version 1.10 book - Part 49 of 212
The Ring programming language version 1.10 book - Part 49 of 212The Ring programming language version 1.10 book - Part 49 of 212
The Ring programming language version 1.10 book - Part 49 of 212
Mahmoud Samir Fayed
 
基于 Google protobuf 的 webgame 网络协议设计
基于 Google protobuf 的 webgame 网络协议设计基于 Google protobuf 的 webgame 网络协议设计
基于 Google protobuf 的 webgame 网络协议设计
勇浩 赖
 
The Ring programming language version 1.5.3 book - Part 39 of 184
The Ring programming language version 1.5.3 book - Part 39 of 184The Ring programming language version 1.5.3 book - Part 39 of 184
The Ring programming language version 1.5.3 book - Part 39 of 184
Mahmoud Samir Fayed
 
Is boilerplate code really so bad?
Is boilerplate code really so bad?Is boilerplate code really so bad?
Is boilerplate code really so bad?
Trisha Gee
 
JSON-RPC Proxy Generation with PHP 5
JSON-RPC Proxy Generation with PHP 5JSON-RPC Proxy Generation with PHP 5
JSON-RPC Proxy Generation with PHP 5
Stephan Schmidt
 
Gradle in a Polyglot World
Gradle in a Polyglot WorldGradle in a Polyglot World
Gradle in a Polyglot World
Schalk Cronjé
 
Linux kernel TLS и HTTPS / Александр Крижановский (Tempesta Technologies)
Linux kernel TLS и HTTPS / Александр Крижановский (Tempesta Technologies)Linux kernel TLS и HTTPS / Александр Крижановский (Tempesta Technologies)
Linux kernel TLS и HTTPS / Александр Крижановский (Tempesta Technologies)
Ontico
 
WordPress Performance Tuning
WordPress Performance TuningWordPress Performance Tuning
WordPress Performance Tuning
Javier Arturo Rodríguez
 
Introduction to Groovy (Serbian Developer Conference 2013)
Introduction to Groovy (Serbian Developer Conference 2013)Introduction to Groovy (Serbian Developer Conference 2013)
Introduction to Groovy (Serbian Developer Conference 2013)
Joachim Baumann
 
Getting groovy (ODP)
Getting groovy (ODP)Getting groovy (ODP)
Getting groovy (ODP)
Nick Dixon
 
Austin Bingham. Python Refactoring. PyCon Belarus
Austin Bingham. Python Refactoring. PyCon BelarusAustin Bingham. Python Refactoring. PyCon Belarus
Austin Bingham. Python Refactoring. PyCon Belarus
Alina Dolgikh
 
On UnQLite
On UnQLiteOn UnQLite
On UnQLite
charsbar
 
Rihards Olups - Encrypting Daemon Traffic With Zabbix 3.0
Rihards Olups - Encrypting Daemon Traffic With Zabbix 3.0Rihards Olups - Encrypting Daemon Traffic With Zabbix 3.0
Rihards Olups - Encrypting Daemon Traffic With Zabbix 3.0
Zabbix
 
Modern javascript localization with c-3po and the good old gettext
Modern javascript localization with c-3po and the good old gettextModern javascript localization with c-3po and the good old gettext
Modern javascript localization with c-3po and the good old gettext
Alexander Mostovenko
 
Introduction to source{d} Engine and source{d} Lookout
Introduction to source{d} Engine and source{d} Lookout Introduction to source{d} Engine and source{d} Lookout
Introduction to source{d} Engine and source{d} Lookout
source{d}
 
Writing and using php streams and sockets
Writing and using php streams and socketsWriting and using php streams and sockets
Writing and using php streams and sockets
Elizabeth Smith
 
"Развитие ветки PHP-7"
"Развитие ветки PHP-7""Развитие ветки PHP-7"
"Развитие ветки PHP-7"
Badoo Development
 
The Ring programming language version 1.10 book - Part 49 of 212
The Ring programming language version 1.10 book - Part 49 of 212The Ring programming language version 1.10 book - Part 49 of 212
The Ring programming language version 1.10 book - Part 49 of 212
Mahmoud Samir Fayed
 
基于 Google protobuf 的 webgame 网络协议设计
基于 Google protobuf 的 webgame 网络协议设计基于 Google protobuf 的 webgame 网络协议设计
基于 Google protobuf 的 webgame 网络协议设计
勇浩 赖
 
The Ring programming language version 1.5.3 book - Part 39 of 184
The Ring programming language version 1.5.3 book - Part 39 of 184The Ring programming language version 1.5.3 book - Part 39 of 184
The Ring programming language version 1.5.3 book - Part 39 of 184
Mahmoud Samir Fayed
 
Is boilerplate code really so bad?
Is boilerplate code really so bad?Is boilerplate code really so bad?
Is boilerplate code really so bad?
Trisha Gee
 
JSON-RPC Proxy Generation with PHP 5
JSON-RPC Proxy Generation with PHP 5JSON-RPC Proxy Generation with PHP 5
JSON-RPC Proxy Generation with PHP 5
Stephan Schmidt
 
Gradle in a Polyglot World
Gradle in a Polyglot WorldGradle in a Polyglot World
Gradle in a Polyglot World
Schalk Cronjé
 
Linux kernel TLS и HTTPS / Александр Крижановский (Tempesta Technologies)
Linux kernel TLS и HTTPS / Александр Крижановский (Tempesta Technologies)Linux kernel TLS и HTTPS / Александр Крижановский (Tempesta Technologies)
Linux kernel TLS и HTTPS / Александр Крижановский (Tempesta Technologies)
Ontico
 
Introduction to Groovy (Serbian Developer Conference 2013)
Introduction to Groovy (Serbian Developer Conference 2013)Introduction to Groovy (Serbian Developer Conference 2013)
Introduction to Groovy (Serbian Developer Conference 2013)
Joachim Baumann
 

Viewers also liked (17)

Crash Course in Natural Language Processing (2016)
Crash Course in Natural Language Processing (2016)Crash Course in Natural Language Processing (2016)
Crash Course in Natural Language Processing (2016)
Vsevolod Dyomkin
 
Aspects of NLP Practice
Aspects of NLP PracticeAspects of NLP Practice
Aspects of NLP Practice
Vsevolod Dyomkin
 
Tedxkyiv communication guidelines
Tedxkyiv communication guidelinesTedxkyiv communication guidelines
Tedxkyiv communication guidelines
Vsevolod Dyomkin
 
Lisp Machine Prunciples
Lisp Machine PrunciplesLisp Machine Prunciples
Lisp Machine Prunciples
Vsevolod Dyomkin
 
Чему мы можем научиться у Lisp'а?
Чему мы можем научиться у Lisp'а?Чему мы можем научиться у Lisp'а?
Чему мы можем научиться у Lisp'а?
Vsevolod Dyomkin
 
Новые нереляционные системы хранения данных
Новые нереляционные системы хранения данныхНовые нереляционные системы хранения данных
Новые нереляционные системы хранения данных
Vsevolod Dyomkin
 
Lisp как универсальная обертка
Lisp как универсальная оберткаLisp как универсальная обертка
Lisp как универсальная обертка
Vsevolod Dyomkin
 
Can functional programming be liberated from static typing?
Can functional programming be liberated from static typing?Can functional programming be liberated from static typing?
Can functional programming be liberated from static typing?
Vsevolod Dyomkin
 
NLP in the WILD or Building a System for Text Language Identification
NLP in the WILD or Building a System for Text Language IdentificationNLP in the WILD or Building a System for Text Language Identification
NLP in the WILD or Building a System for Text Language Identification
Vsevolod Dyomkin
 
Practical NLP with Lisp
Practical NLP with LispPractical NLP with Lisp
Practical NLP with Lisp
Vsevolod Dyomkin
 
Lisp for Python Programmers
Lisp for Python ProgrammersLisp for Python Programmers
Lisp for Python Programmers
Vsevolod Dyomkin
 
Экосистема Common Lisp
Экосистема Common LispЭкосистема Common Lisp
Экосистема Common Lisp
Vsevolod Dyomkin
 
NLP Project Full Cycle
NLP Project Full CycleNLP Project Full Cycle
NLP Project Full Cycle
Vsevolod Dyomkin
 
Sugaring Lisp for the 21st Century
Sugaring Lisp for the 21st CenturySugaring Lisp for the 21st Century
Sugaring Lisp for the 21st Century
Vsevolod Dyomkin
 
Crash-course in Natural Language Processing
Crash-course in Natural Language ProcessingCrash-course in Natural Language Processing
Crash-course in Natural Language Processing
Vsevolod Dyomkin
 
Natural Language Processing in Practice
Natural Language Processing in PracticeNatural Language Processing in Practice
Natural Language Processing in Practice
Vsevolod Dyomkin
 
Natural language processing
Natural language processingNatural language processing
Natural language processing
Hansi Thenuwara
 
Crash Course in Natural Language Processing (2016)
Crash Course in Natural Language Processing (2016)Crash Course in Natural Language Processing (2016)
Crash Course in Natural Language Processing (2016)
Vsevolod Dyomkin
 
Tedxkyiv communication guidelines
Tedxkyiv communication guidelinesTedxkyiv communication guidelines
Tedxkyiv communication guidelines
Vsevolod Dyomkin
 
Чему мы можем научиться у Lisp'а?
Чему мы можем научиться у Lisp'а?Чему мы можем научиться у Lisp'а?
Чему мы можем научиться у Lisp'а?
Vsevolod Dyomkin
 
Новые нереляционные системы хранения данных
Новые нереляционные системы хранения данныхНовые нереляционные системы хранения данных
Новые нереляционные системы хранения данных
Vsevolod Dyomkin
 
Lisp как универсальная обертка
Lisp как универсальная оберткаLisp как универсальная обертка
Lisp как универсальная обертка
Vsevolod Dyomkin
 
Can functional programming be liberated from static typing?
Can functional programming be liberated from static typing?Can functional programming be liberated from static typing?
Can functional programming be liberated from static typing?
Vsevolod Dyomkin
 
NLP in the WILD or Building a System for Text Language Identification
NLP in the WILD or Building a System for Text Language IdentificationNLP in the WILD or Building a System for Text Language Identification
NLP in the WILD or Building a System for Text Language Identification
Vsevolod Dyomkin
 
Lisp for Python Programmers
Lisp for Python ProgrammersLisp for Python Programmers
Lisp for Python Programmers
Vsevolod Dyomkin
 
Экосистема Common Lisp
Экосистема Common LispЭкосистема Common Lisp
Экосистема Common Lisp
Vsevolod Dyomkin
 
Sugaring Lisp for the 21st Century
Sugaring Lisp for the 21st CenturySugaring Lisp for the 21st Century
Sugaring Lisp for the 21st Century
Vsevolod Dyomkin
 
Crash-course in Natural Language Processing
Crash-course in Natural Language ProcessingCrash-course in Natural Language Processing
Crash-course in Natural Language Processing
Vsevolod Dyomkin
 
Natural Language Processing in Practice
Natural Language Processing in PracticeNatural Language Processing in Practice
Natural Language Processing in Practice
Vsevolod Dyomkin
 
Natural language processing
Natural language processingNatural language processing
Natural language processing
Hansi Thenuwara
 
Ad

Similar to CL-NLP (20)

Enhancing Domain Specific Language Implementations Through Ontology
Enhancing Domain Specific Language Implementations Through OntologyEnhancing Domain Specific Language Implementations Through Ontology
Enhancing Domain Specific Language Implementations Through Ontology
Chunhua Liao
 
Perl at SkyCon'12
Perl at SkyCon'12Perl at SkyCon'12
Perl at SkyCon'12
Tim Bunce
 
ElasticSearch for .NET Developers
ElasticSearch for .NET DevelopersElasticSearch for .NET Developers
ElasticSearch for .NET Developers
Ben van Mol
 
Perl - laziness, impatience, hubris, and one liners
Perl - laziness, impatience, hubris, and one linersPerl - laziness, impatience, hubris, and one liners
Perl - laziness, impatience, hubris, and one liners
Kirk Kimmel
 
JDD 2016 - Tomasz Borek - DB for next project? Why, Postgres, of course
JDD 2016 - Tomasz Borek - DB for next project? Why, Postgres, of course JDD 2016 - Tomasz Borek - DB for next project? Why, Postgres, of course
JDD 2016 - Tomasz Borek - DB for next project? Why, Postgres, of course
PROIDEA
 
Learning Puppet basic thing
Learning Puppet basic thing Learning Puppet basic thing
Learning Puppet basic thing
DaeHyung Lee
 
What we can learn from Rebol?
What we can learn from Rebol?What we can learn from Rebol?
What we can learn from Rebol?
lichtkind
 
JRuby e DSL
JRuby e DSLJRuby e DSL
JRuby e DSL
jodosha
 
APMG juni 2014 - Regular Expression
APMG juni 2014 - Regular ExpressionAPMG juni 2014 - Regular Expression
APMG juni 2014 - Regular Expression
Byte
 
Jsonsaga 100605143125-phpapp02
Jsonsaga 100605143125-phpapp02Jsonsaga 100605143125-phpapp02
Jsonsaga 100605143125-phpapp02
Ramamohan Chokkam
 
Spark + Clojure for Topic Discovery - Zalando Tech Clojure/Conj Talk
Spark + Clojure for Topic Discovery - Zalando Tech Clojure/Conj TalkSpark + Clojure for Topic Discovery - Zalando Tech Clojure/Conj Talk
Spark + Clojure for Topic Discovery - Zalando Tech Clojure/Conj Talk
Zalando Technology
 
Building DSLs On CLR and DLR (Microsoft.NET)
Building DSLs On CLR and DLR (Microsoft.NET)Building DSLs On CLR and DLR (Microsoft.NET)
Building DSLs On CLR and DLR (Microsoft.NET)
Vitaly Baum
 
Neo4j after 1 year in production
Neo4j after 1 year in productionNeo4j after 1 year in production
Neo4j after 1 year in production
Andrew Nikishaev
 
Meta Object Protocols
Meta Object ProtocolsMeta Object Protocols
Meta Object Protocols
Pierre de Lacaze
 
Scala Parser Combinators - Scalapeno Lightning Talk
Scala Parser Combinators - Scalapeno Lightning TalkScala Parser Combinators - Scalapeno Lightning Talk
Scala Parser Combinators - Scalapeno Lightning Talk
Lior Schejter
 
Bioinformatica: Esercizi su Perl, espressioni regolari e altre amenità (BMR G...
Bioinformatica: Esercizi su Perl, espressioni regolari e altre amenità (BMR G...Bioinformatica: Esercizi su Perl, espressioni regolari e altre amenità (BMR G...
Bioinformatica: Esercizi su Perl, espressioni regolari e altre amenità (BMR G...
Andrea Telatin
 
Apache Spark Workshop
Apache Spark WorkshopApache Spark Workshop
Apache Spark Workshop
Michael Spector
 
Redis: REmote DIctionary Server
Redis: REmote DIctionary ServerRedis: REmote DIctionary Server
Redis: REmote DIctionary Server
Ezra Zygmuntowicz
 
A Deep Dive Into Spark
A Deep Dive Into SparkA Deep Dive Into Spark
A Deep Dive Into Spark
Ashish kumar
 
DSLs for fun and profit by Jukka Välimaa
DSLs for fun and profit by Jukka VälimaaDSLs for fun and profit by Jukka Välimaa
DSLs for fun and profit by Jukka Välimaa
Montel Intergalactic
 
Enhancing Domain Specific Language Implementations Through Ontology
Enhancing Domain Specific Language Implementations Through OntologyEnhancing Domain Specific Language Implementations Through Ontology
Enhancing Domain Specific Language Implementations Through Ontology
Chunhua Liao
 
Perl at SkyCon'12
Perl at SkyCon'12Perl at SkyCon'12
Perl at SkyCon'12
Tim Bunce
 
ElasticSearch for .NET Developers
ElasticSearch for .NET DevelopersElasticSearch for .NET Developers
ElasticSearch for .NET Developers
Ben van Mol
 
Perl - laziness, impatience, hubris, and one liners
Perl - laziness, impatience, hubris, and one linersPerl - laziness, impatience, hubris, and one liners
Perl - laziness, impatience, hubris, and one liners
Kirk Kimmel
 
JDD 2016 - Tomasz Borek - DB for next project? Why, Postgres, of course
JDD 2016 - Tomasz Borek - DB for next project? Why, Postgres, of course JDD 2016 - Tomasz Borek - DB for next project? Why, Postgres, of course
JDD 2016 - Tomasz Borek - DB for next project? Why, Postgres, of course
PROIDEA
 
Learning Puppet basic thing
Learning Puppet basic thing Learning Puppet basic thing
Learning Puppet basic thing
DaeHyung Lee
 
What we can learn from Rebol?
What we can learn from Rebol?What we can learn from Rebol?
What we can learn from Rebol?
lichtkind
 
JRuby e DSL
JRuby e DSLJRuby e DSL
JRuby e DSL
jodosha
 
APMG juni 2014 - Regular Expression
APMG juni 2014 - Regular ExpressionAPMG juni 2014 - Regular Expression
APMG juni 2014 - Regular Expression
Byte
 
Jsonsaga 100605143125-phpapp02
Jsonsaga 100605143125-phpapp02Jsonsaga 100605143125-phpapp02
Jsonsaga 100605143125-phpapp02
Ramamohan Chokkam
 
Spark + Clojure for Topic Discovery - Zalando Tech Clojure/Conj Talk
Spark + Clojure for Topic Discovery - Zalando Tech Clojure/Conj TalkSpark + Clojure for Topic Discovery - Zalando Tech Clojure/Conj Talk
Spark + Clojure for Topic Discovery - Zalando Tech Clojure/Conj Talk
Zalando Technology
 
Building DSLs On CLR and DLR (Microsoft.NET)
Building DSLs On CLR and DLR (Microsoft.NET)Building DSLs On CLR and DLR (Microsoft.NET)
Building DSLs On CLR and DLR (Microsoft.NET)
Vitaly Baum
 
Neo4j after 1 year in production
Neo4j after 1 year in productionNeo4j after 1 year in production
Neo4j after 1 year in production
Andrew Nikishaev
 
Scala Parser Combinators - Scalapeno Lightning Talk
Scala Parser Combinators - Scalapeno Lightning TalkScala Parser Combinators - Scalapeno Lightning Talk
Scala Parser Combinators - Scalapeno Lightning Talk
Lior Schejter
 
Bioinformatica: Esercizi su Perl, espressioni regolari e altre amenità (BMR G...
Bioinformatica: Esercizi su Perl, espressioni regolari e altre amenità (BMR G...Bioinformatica: Esercizi su Perl, espressioni regolari e altre amenità (BMR G...
Bioinformatica: Esercizi su Perl, espressioni regolari e altre amenità (BMR G...
Andrea Telatin
 
Redis: REmote DIctionary Server
Redis: REmote DIctionary ServerRedis: REmote DIctionary Server
Redis: REmote DIctionary Server
Ezra Zygmuntowicz
 
A Deep Dive Into Spark
A Deep Dive Into SparkA Deep Dive Into Spark
A Deep Dive Into Spark
Ashish kumar
 
DSLs for fun and profit by Jukka Välimaa
DSLs for fun and profit by Jukka VälimaaDSLs for fun and profit by Jukka Välimaa
DSLs for fun and profit by Jukka Välimaa
Montel Intergalactic
 
Ad

Recently uploaded (20)

Linux Professional Institute LPIC-1 Exam.pdf
Linux Professional Institute LPIC-1 Exam.pdfLinux Professional Institute LPIC-1 Exam.pdf
Linux Professional Institute LPIC-1 Exam.pdf
RHCSA Guru
 
Cybersecurity Identity and Access Solutions using Azure AD
Cybersecurity Identity and Access Solutions using Azure ADCybersecurity Identity and Access Solutions using Azure AD
Cybersecurity Identity and Access Solutions using Azure AD
VICTOR MAESTRE RAMIREZ
 
Noah Loul Shares 5 Steps to Implement AI Agents for Maximum Business Efficien...
Noah Loul Shares 5 Steps to Implement AI Agents for Maximum Business Efficien...Noah Loul Shares 5 Steps to Implement AI Agents for Maximum Business Efficien...
Noah Loul Shares 5 Steps to Implement AI Agents for Maximum Business Efficien...
Noah Loul
 
Cyber Awareness overview for 2025 month of security
Cyber Awareness overview for 2025 month of securityCyber Awareness overview for 2025 month of security
Cyber Awareness overview for 2025 month of security
riccardosl1
 
Dev Dives: Automate and orchestrate your processes with UiPath Maestro
Dev Dives: Automate and orchestrate your processes with UiPath MaestroDev Dives: Automate and orchestrate your processes with UiPath Maestro
Dev Dives: Automate and orchestrate your processes with UiPath Maestro
UiPathCommunity
 
HCL Nomad Web – Best Practices und Verwaltung von Multiuser-Umgebungen
HCL Nomad Web – Best Practices und Verwaltung von Multiuser-UmgebungenHCL Nomad Web – Best Practices und Verwaltung von Multiuser-Umgebungen
HCL Nomad Web – Best Practices und Verwaltung von Multiuser-Umgebungen
panagenda
 
TrustArc Webinar: Consumer Expectations vs Corporate Realities on Data Broker...
TrustArc Webinar: Consumer Expectations vs Corporate Realities on Data Broker...TrustArc Webinar: Consumer Expectations vs Corporate Realities on Data Broker...
TrustArc Webinar: Consumer Expectations vs Corporate Realities on Data Broker...
TrustArc
 
Mobile App Development Company in Saudi Arabia
Mobile App Development Company in Saudi ArabiaMobile App Development Company in Saudi Arabia
Mobile App Development Company in Saudi Arabia
Steve Jonas
 
IEDM 2024 Tutorial2_Advances in CMOS Technologies and Future Directions for C...
IEDM 2024 Tutorial2_Advances in CMOS Technologies and Future Directions for C...IEDM 2024 Tutorial2_Advances in CMOS Technologies and Future Directions for C...
IEDM 2024 Tutorial2_Advances in CMOS Technologies and Future Directions for C...
organizerofv
 
Greenhouse_Monitoring_Presentation.pptx.
Greenhouse_Monitoring_Presentation.pptx.Greenhouse_Monitoring_Presentation.pptx.
Greenhouse_Monitoring_Presentation.pptx.
hpbmnnxrvb
 
Transcript: #StandardsGoals for 2025: Standards & certification roundup - Tec...
Transcript: #StandardsGoals for 2025: Standards & certification roundup - Tec...Transcript: #StandardsGoals for 2025: Standards & certification roundup - Tec...
Transcript: #StandardsGoals for 2025: Standards & certification roundup - Tec...
BookNet Canada
 
How analogue intelligence complements AI
How analogue intelligence complements AIHow analogue intelligence complements AI
How analogue intelligence complements AI
Paul Rowe
 
Build Your Own Copilot & Agents For Devs
Build Your Own Copilot & Agents For DevsBuild Your Own Copilot & Agents For Devs
Build Your Own Copilot & Agents For Devs
Brian McKeiver
 
Enhancing ICU Intelligence: How Our Functional Testing Enabled a Healthcare I...
Enhancing ICU Intelligence: How Our Functional Testing Enabled a Healthcare I...Enhancing ICU Intelligence: How Our Functional Testing Enabled a Healthcare I...
Enhancing ICU Intelligence: How Our Functional Testing Enabled a Healthcare I...
Impelsys Inc.
 
AI and Data Privacy in 2025: Global Trends
AI and Data Privacy in 2025: Global TrendsAI and Data Privacy in 2025: Global Trends
AI and Data Privacy in 2025: Global Trends
InData Labs
 
Big Data Analytics Quick Research Guide by Arthur Morgan
Big Data Analytics Quick Research Guide by Arthur MorganBig Data Analytics Quick Research Guide by Arthur Morgan
Big Data Analytics Quick Research Guide by Arthur Morgan
Arthur Morgan
 
Technology Trends in 2025: AI and Big Data Analytics
Technology Trends in 2025: AI and Big Data AnalyticsTechnology Trends in 2025: AI and Big Data Analytics
Technology Trends in 2025: AI and Big Data Analytics
InData Labs
 
The Evolution of Meme Coins A New Era for Digital Currency ppt.pdf
The Evolution of Meme Coins A New Era for Digital Currency ppt.pdfThe Evolution of Meme Coins A New Era for Digital Currency ppt.pdf
The Evolution of Meme Coins A New Era for Digital Currency ppt.pdf
Abi john
 
Complete Guide to Advanced Logistics Management Software in Riyadh.pdf
Complete Guide to Advanced Logistics Management Software in Riyadh.pdfComplete Guide to Advanced Logistics Management Software in Riyadh.pdf
Complete Guide to Advanced Logistics Management Software in Riyadh.pdf
Software Company
 
tecnologias de las primeras civilizaciones.pdf
tecnologias de las primeras civilizaciones.pdftecnologias de las primeras civilizaciones.pdf
tecnologias de las primeras civilizaciones.pdf
fjgm517
 
Linux Professional Institute LPIC-1 Exam.pdf
Linux Professional Institute LPIC-1 Exam.pdfLinux Professional Institute LPIC-1 Exam.pdf
Linux Professional Institute LPIC-1 Exam.pdf
RHCSA Guru
 
Cybersecurity Identity and Access Solutions using Azure AD
Cybersecurity Identity and Access Solutions using Azure ADCybersecurity Identity and Access Solutions using Azure AD
Cybersecurity Identity and Access Solutions using Azure AD
VICTOR MAESTRE RAMIREZ
 
Noah Loul Shares 5 Steps to Implement AI Agents for Maximum Business Efficien...
Noah Loul Shares 5 Steps to Implement AI Agents for Maximum Business Efficien...Noah Loul Shares 5 Steps to Implement AI Agents for Maximum Business Efficien...
Noah Loul Shares 5 Steps to Implement AI Agents for Maximum Business Efficien...
Noah Loul
 
Cyber Awareness overview for 2025 month of security
Cyber Awareness overview for 2025 month of securityCyber Awareness overview for 2025 month of security
Cyber Awareness overview for 2025 month of security
riccardosl1
 
Dev Dives: Automate and orchestrate your processes with UiPath Maestro
Dev Dives: Automate and orchestrate your processes with UiPath MaestroDev Dives: Automate and orchestrate your processes with UiPath Maestro
Dev Dives: Automate and orchestrate your processes with UiPath Maestro
UiPathCommunity
 
HCL Nomad Web – Best Practices und Verwaltung von Multiuser-Umgebungen
HCL Nomad Web – Best Practices und Verwaltung von Multiuser-UmgebungenHCL Nomad Web – Best Practices und Verwaltung von Multiuser-Umgebungen
HCL Nomad Web – Best Practices und Verwaltung von Multiuser-Umgebungen
panagenda
 
TrustArc Webinar: Consumer Expectations vs Corporate Realities on Data Broker...
TrustArc Webinar: Consumer Expectations vs Corporate Realities on Data Broker...TrustArc Webinar: Consumer Expectations vs Corporate Realities on Data Broker...
TrustArc Webinar: Consumer Expectations vs Corporate Realities on Data Broker...
TrustArc
 
Mobile App Development Company in Saudi Arabia
Mobile App Development Company in Saudi ArabiaMobile App Development Company in Saudi Arabia
Mobile App Development Company in Saudi Arabia
Steve Jonas
 
IEDM 2024 Tutorial2_Advances in CMOS Technologies and Future Directions for C...
IEDM 2024 Tutorial2_Advances in CMOS Technologies and Future Directions for C...IEDM 2024 Tutorial2_Advances in CMOS Technologies and Future Directions for C...
IEDM 2024 Tutorial2_Advances in CMOS Technologies and Future Directions for C...
organizerofv
 
Greenhouse_Monitoring_Presentation.pptx.
Greenhouse_Monitoring_Presentation.pptx.Greenhouse_Monitoring_Presentation.pptx.
Greenhouse_Monitoring_Presentation.pptx.
hpbmnnxrvb
 
Transcript: #StandardsGoals for 2025: Standards & certification roundup - Tec...
Transcript: #StandardsGoals for 2025: Standards & certification roundup - Tec...Transcript: #StandardsGoals for 2025: Standards & certification roundup - Tec...
Transcript: #StandardsGoals for 2025: Standards & certification roundup - Tec...
BookNet Canada
 
How analogue intelligence complements AI
How analogue intelligence complements AIHow analogue intelligence complements AI
How analogue intelligence complements AI
Paul Rowe
 
Build Your Own Copilot & Agents For Devs
Build Your Own Copilot & Agents For DevsBuild Your Own Copilot & Agents For Devs
Build Your Own Copilot & Agents For Devs
Brian McKeiver
 
Enhancing ICU Intelligence: How Our Functional Testing Enabled a Healthcare I...
Enhancing ICU Intelligence: How Our Functional Testing Enabled a Healthcare I...Enhancing ICU Intelligence: How Our Functional Testing Enabled a Healthcare I...
Enhancing ICU Intelligence: How Our Functional Testing Enabled a Healthcare I...
Impelsys Inc.
 
AI and Data Privacy in 2025: Global Trends
AI and Data Privacy in 2025: Global TrendsAI and Data Privacy in 2025: Global Trends
AI and Data Privacy in 2025: Global Trends
InData Labs
 
Big Data Analytics Quick Research Guide by Arthur Morgan
Big Data Analytics Quick Research Guide by Arthur MorganBig Data Analytics Quick Research Guide by Arthur Morgan
Big Data Analytics Quick Research Guide by Arthur Morgan
Arthur Morgan
 
Technology Trends in 2025: AI and Big Data Analytics
Technology Trends in 2025: AI and Big Data AnalyticsTechnology Trends in 2025: AI and Big Data Analytics
Technology Trends in 2025: AI and Big Data Analytics
InData Labs
 
The Evolution of Meme Coins A New Era for Digital Currency ppt.pdf
The Evolution of Meme Coins A New Era for Digital Currency ppt.pdfThe Evolution of Meme Coins A New Era for Digital Currency ppt.pdf
The Evolution of Meme Coins A New Era for Digital Currency ppt.pdf
Abi john
 
Complete Guide to Advanced Logistics Management Software in Riyadh.pdf
Complete Guide to Advanced Logistics Management Software in Riyadh.pdfComplete Guide to Advanced Logistics Management Software in Riyadh.pdf
Complete Guide to Advanced Logistics Management Software in Riyadh.pdf
Software Company
 
tecnologias de las primeras civilizaciones.pdf
tecnologias de las primeras civilizaciones.pdftecnologias de las primeras civilizaciones.pdf
tecnologias de las primeras civilizaciones.pdf
fjgm517
 

CL-NLP

  • 1. CL-NLP – A Natural Language Processing Library for Common Lisp Vsevolod Dyomkin
  • 2. Topics * Motivation * Library high-level design * Overview of the functions * Some technical details * Next steps
  • 3. Motivation NLP & Lisp is a match, made in heaven
  • 4. Previous Work * CL-Langutils * Lexiparse, Sparser, CL-EARLY- PARSER, Basic-English-Grammar * Wordnet interfaces * CMU AI Repository * Natural Language Understanding * Natural Language Processing in Lisp
  • 5. Essential Scope * Corpora interfaces * Language modeling * Measures * Tokenization * Tagging * Parsing * Wordnet interface * Construct a pipeline
  • 7. An NLP Pipeline “This is a test.” -> TOP | S /~~~~~~~~~~|~~~~~~~~ NP VP . | /~~~~~~ | DT VBZ NP . | | /~~~~ This is DT NN | | a test
  • 8. An NLP Pipeline (print-parse-trees "This is a test.") (defun print-parse-trees (text) (dolist (sentence (tokenize <sentence-tokenizer> text)) (pprint-tree (parse *pcfg* (tag *hmm* text))))) (defparameter *hmm* (train 'hmm-tagger "treebank_dir/")) (defparameter *pcfg* (train 'pcfg "treebank_dir/"))
  • 9. Some Design Choices * A pseudo-hierarchical package design
  • 10. High-level Design * cl-nlp & cl-nlp-contrib & more * base & functional packages * nlp-user
  • 11. Modules * nlp.core, nlp.corpora, nlp.util, nlp.test-util * nlp.syntax, nlp.generation, nlp.phonetics * nlp.contrib.wordnet, nlp.contrib.ms-ngrams
  • 12. Some Design Choices * Use the whole language - defclass & defstruct - hash-tables & alists - do & loop
  • 13. Some Design Choices * Grow the language - rutils + reader macros + shorthands + dotable, dolines - special-purpose utils
  • 14. General-Purpose Utils (json:encode-json #{ "success" t "sentence" text "match" (when-it (2nd (funcall fn (get-model text))) (strcat "{" (strjoin "," (mapcar #`(fmt ""~A":[~A,~A]" (car %) (cadr %) (cddr %)) (ht->alist it))) "}")) } stream)
  • 15. A Special-Purpose Util (define-lazy-singleton word-tokenizer (make 'postprocessing-regex-word-tokenizer) "Default word tokenizer.") (defun tokenize-ngram (ngrams str) "Transform string STR to a list if necessary (depending of order of NGRAMS)." (if (> (ngrams-order ngrams) 1) (tokenize <word-tokenizer> str) str))
  • 16. A Special-Purpose Util (defmacro define-lazy-singleton (name init &optional docstring) (with-gensyms (singleton) `(let (,singleton) (defun ,name () ,docstring (or ,singleton (setf ,singleton ,init))) (define-symbol-macro ,(mksym name :format "<~A>") (,name)))))
  • 17. Some Design Choices * Use CLOS as a foundation
  • 18. Basic Cell (defclass regex-word-tokenizer (tokenizer) ((regex :accessor tokenizer-regex :initarg :regex :initform (re:create-scanner "w+|[!"#$%&'*+,./:;<=>?@^`~…() {}[|] «»“”‘’¶-]"⟨⟩ ‒–—― ) :documentation "A simpler variant would be [^s]+ — it doesn't split punctuation, yet sometimes it's desirable.")) (:documentation "Regex-based word tokenizer."))
  • 19. Basic Cell (defmethod tokenize ((tokenizer regex-word-tokenizer) string) (loop :for (beg end) :on (re:all-matches (tokenizer-regex tokenizer) string) :by #'cddr :collect (sub string beg end) :into words :collect (cons beg end) :into spans :finally (return (values words spans)))
  • 20. Another Example (defgeneric parse (model sentence) (:documentation "Parse SENTENCE with MODEL.") (:method :around (model (sentence string)) (call-next-method model (tokenize <word-tokenizer> string)))) (defgeneric parse-n (model sentence n) (:documentation "Return N best parse trees of the SENTENCE with MODEL.") (:method :around (model (sentence string) n) (call-next-method model (tokenize <word-tokenizer> string) n)))
  • 21. Parsing (defmethod parse ((grammar pcfg) (sentence list)) (CKY (let* ((cur (cons rule (1- s))) (l (@ pi0 (1- i) (1- s) (second rule))) (r (@ pi0 s (1- j) (third rule))) (score (if (and l r) (+ (log q) l r) min))) (when (> score (or max min)) (setf max score arg cur)))))
  • 22. Parsing (defmethod parse :around ((grammar pcfg) (sentence list)) (with-raw-results (values (idx->nts (decode-parse-tree sentence bps 0 last iroot)) (exp (or (@ pi0 0 last iroot) min)) pi0 bps)))
  • 23. (macrolet ((CKY (&body body) `(with-slots (rules nts->idx) grammar (let* ((pi0 #{}) (bps #{}) (min most-negative-single-float)) ;; init pi0 & bps (skipped) (do ((pos 1 (1+ pos))) ((>= pos *sentence-length*)) (do ((i 1 (1+ i))) ((> i (- *sentence-length* pos))) (let ((j (+ i pos))) (dotable (_ k nts->idx) (let (max arg) (do ((s i (1+ s))) ((>= s j)) (dotable (rule q rules) (when (and (tryadic rule) (= k (first rule))) ,@body))) (when (if (listp max) max (> max min)) (setf (@ pi0 (1- i) (1- j) k) max (@ bps (1- i) (1- j) k) arg))))))) (values pi0 bps)))))
  • 24. (declaim (inline @)) (defun @ (m i j k) (get# (+ (* i *sentence-length* *nt-count*) (* j *nt-count*) k) m)) (defsetf @ (m i j k) (v) `(set# (+ (* ,i *sentence-length* *nt-count*) (* ,j *nt-count*) ,k) ,m ,v))