From 6a0a41825dec31e6fe1e3dba4c7e0ea7360b6617 Mon Sep 17 00:00:00 2001
From: Charles Plessy <https://launchpad.net/~plessy>
Date: Sun, 21 Dec 2014 14:30:18 +0900
Subject: [PATCH] Removed TODO and refreshed benchmark.

---
 Haskell/refSeqIdSymbol.html | 52 ++++++++++---------------
 Haskell/refSeqIdSymbol.lhs  | 76 +++++++++++++++++++------------------
 2 files changed, 58 insertions(+), 70 deletions(-)
diff --git a/Haskell/refSeqIdSymbol.html b/Haskell/refSeqIdSymbol.html
index d5a22772..9b1bf274 100644
--- a/Haskell/refSeqIdSymbol.html
+++ b/Haskell/refSeqIdSymbol.html
@@ -52,7 +52,6 @@ parseGbRecord r <span class="fu">=</span> <span class="kw">case</span> parse gbR
                           print e
             <span class="dt">Right</span> r <span class="ot">-&gt;</span>  putStrLn r</code></pre>
 <p>Parsec returns either an error message or the result of the parsing.</p>
-<p>TODO: how about printing always ? What is the difference between pring and putStrLn ?</p>
 <pre class="sourceCode literate haskell"><code class="sourceCode haskell">gbRecord <span class="fu">=</span> <span class="kw">do</span>
   fs <span class="ot">&lt;-</span> many field
   return <span class="fu">.</span> intercalate <span class="st">&quot;\t&quot;</span> <span class="fu">$</span> filter (<span class="fu">/=</span> <span class="st">&quot;&quot;</span>) fs</code></pre>
@@ -104,40 +103,27 @@ separator <span class="fu">=</span> newline <span class="fu">&gt;&gt;</span> not
 <li><p>cleans up the remaining double quotes.</p></li>
 </ul>
 <h2 id="speed">Speed</h2>
-<p>Unfortunately, the Haskell version is way too slow to process the full RefSeq data. Here is a comparison using a test file of only 476 Kib.</p>
+<p>The Haskell version is way too slow to process the full RefSeq data. Here is a comparison using a test file of only 100,000 lines (~4 megabytes).</p>
 <pre><code>$ ghc -O2 refSeqIdSymbol.lhs
 [1 of 1] Compiling Main             ( refSeqIdSymbol.lhs, refSeqIdSymbol.o )
 Linking refSeqIdSymbol ...
-choucaâ~â$ time ./refSeqIdSymbol &lt; hopla.gb 
-NM_001142483.1  NREP
-NM_001142481.1  NREP
-NM_001142480.1  NREP
-NM_001142477.1  NREP
-NM_001142475.1  NREP
-NM_001142474.1  NREP
-NM_004772.2 NREP
-NM_001142466.1  GPT2
-NM_133443.2 GPT2
-NM_173685.2 NSMCE2
-NM_007058.3 CAPN11
 
+$ curl --silent ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.4.rna.gbff.gz | gunzip | head -n 100000 &gt; refSeqIdSymbol.testdata.gb</code></pre>
+<pre><code>$ time cat refSeqIdSymbol.testdata.gb | grep -e VERSION -e gene= |   uniq |   sed &#39;s/=/ /&#39; |   awk &#39;{print $2}&#39; |   tr &#39;\n&#39; &#39;\t&#39; |   sed -e &#39;s/&quot;\t/\n/g&#39; -e &#39;s/&quot;//g&#39; | head
+NM_025073.2     SIKE1
+NM_181712.4     KANK4
+NM_152372.3     MYOM3
+NM_175066.3     DDX51
+NM_024541.2     C10orf76
+NM_015491.1     PNISR
+NM_032870.2     PNISR
+NM_182482.2     BAGE2
+NM_005355.3     KIF25
+NM_030615.2     KIF25
 
-real    0m0.701s
-user    0m0.664s
-sys 0m0.028s</code></pre>
-<pre><code>$ time cat hopla.gb | grep -e VERSION -e gene= |   uniq |   sed &#39;s/=/ /&#39; |   awk &#39;{print $2}&#39; |   tr &#39;\n&#39; &#39;\t&#39; |   sed -e &#39;s/&quot;\t/\n/g&#39; -e &#39;s/&quot;//g&#39;
-NM_001142483.1  NREP
-NM_001142481.1  NREP
-NM_001142480.1  NREP
-NM_001142477.1  NREP
-NM_001142475.1  NREP
-NM_001142474.1  NREP
-NM_004772.2 NREP
-NM_001142466.1  GPT2
-NM_133443.2 GPT2
-NM_173685.2 NSMCE2
-NM_007058.3 CAPN11
-
-real    0m0.015s
-user    0m0.004s
-sys 0m0.004s</code></pre>
+real    0m0.036s
+user    0m0.028s
+sys     0m0.012s</code></pre>
+<p>The Haskell parser is unfortunately more than 100 times slower (5 s vs 0.04 s).</p>
+<pre><code>$ time cat refSeqIdSymbol.testdata.gb | ./refSeqIdSymbol | head&#10;NM_025073.2     SIKE1&#10;NM_181712.4     KANK4&#10;NM_152372.3     MYOM3&#10;NM_175066.3     DDX51&#10;NM_024541.2     C10orf76&#10;NM_015491.1     PNISR&#10;NM_032870.2     PNISR&#10;NM_182482.2     BAGE2&#10;NM_005355.3     KIF25&#10;NM_030615.2     KIF25&#10;
+real    0m4.963s&#10;user    0m4.808s&#10;sys     0m0.176s</code></pre>
diff --git a/Haskell/refSeqIdSymbol.lhs b/Haskell/refSeqIdSymbol.lhs
index 09d28585..8a96c424 100644
--- a/Haskell/refSeqIdSymbol.lhs
+++ b/Haskell/refSeqIdSymbol.lhs
@@ -115,8 +115,6 @@ them.
 
 Parsec returns either an error message or the result of the parsing.
 
-TODO: how about printing always ?  What is the difference between pring and putStrLn ?
-
 > gbRecord = do
 >   fs <- many field
 >   return . intercalate "\t" $ filter (/= "") fs
@@ -218,47 +216,51 @@ In brief, it:
 Speed
 -----
 
-Unfortunately, the Haskell version is way too slow to process the full RefSeq
-data.  Here is a comparison using a test file of only 476 Kib.
+The Haskell version is way too slow to process the full RefSeq data.  Here is a
+comparison using a test file of only 100,000 lines (~4 megabytes).
 
 ~~~~~
 $ ghc -O2 refSeqIdSymbol.lhs
 [1 of 1] Compiling Main             ( refSeqIdSymbol.lhs, refSeqIdSymbol.o )
 Linking refSeqIdSymbol ...
-choucaâ~â$ time ./refSeqIdSymbol < hopla.gb 
-NM_001142483.1	NREP
-NM_001142481.1	NREP
-NM_001142480.1	NREP
-NM_001142477.1	NREP
-NM_001142475.1	NREP
-NM_001142474.1	NREP
-NM_004772.2	NREP
-NM_001142466.1	GPT2
-NM_133443.2	GPT2
-NM_173685.2	NSMCE2
-NM_007058.3	CAPN11
-
-
-real	0m0.701s
-user	0m0.664s
-sys	0m0.028s
+
+$ curl --silent ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.4.rna.gbff.gz | gunzip | head -n 100000 > refSeqIdSymbol.testdata.gb
 ~~~~~
 
 ~~~~~
-$ time cat hopla.gb | grep -e VERSION -e gene= |   uniq |   sed 's/=/ /' |   awk '{print $2}' |   tr '\n' '\t' |   sed -e 's/"\t/\n/g' -e 's/"//g'
-NM_001142483.1	NREP
-NM_001142481.1	NREP
-NM_001142480.1	NREP
-NM_001142477.1	NREP
-NM_001142475.1	NREP
-NM_001142474.1	NREP
-NM_004772.2	NREP
-NM_001142466.1	GPT2
-NM_133443.2	GPT2
-NM_173685.2	NSMCE2
-NM_007058.3	CAPN11
-
-real	0m0.015s
-user	0m0.004s
-sys	0m0.004s
+$ time cat refSeqIdSymbol.testdata.gb | grep -e VERSION -e gene= |   uniq |   sed 's/=/ /' |   awk '{print $2}' |   tr '\n' '\t' |   sed -e 's/"\t/\n/g' -e 's/"//g' | head
+NM_025073.2     SIKE1
+NM_181712.4     KANK4
+NM_152372.3     MYOM3
+NM_175066.3     DDX51
+NM_024541.2     C10orf76
+NM_015491.1     PNISR
+NM_032870.2     PNISR
+NM_182482.2     BAGE2
+NM_005355.3     KIF25
+NM_030615.2     KIF25
+
+real    0m0.036s
+user    0m0.028s
+sys     0m0.012s
+~~~~~
+
+The Haskell parser is unfortunately more than 100 times slower (5 s vs 0.04 s).
+
 ~~~~~
+$ time cat refSeqIdSymbol.testdata.gb | ./refSeqIdSymbol | head
+NM_025073.2     SIKE1
+NM_181712.4     KANK4
+NM_152372.3     MYOM3
+NM_175066.3     DDX51
+NM_024541.2     C10orf76
+NM_015491.1     PNISR
+NM_032870.2     PNISR
+NM_182482.2     BAGE2
+NM_005355.3     KIF25
+NM_030615.2     KIF25
+
+real    0m4.963s
+user    0m4.808s
+sys     0m0.176s
+~~~~~
-- 
2.47.3