From 6a0a41825dec31e6fe1e3dba4c7e0ea7360b6617 Mon Sep 17 00:00:00 2001 From: Charles Plessy Date: Sun, 21 Dec 2014 14:30:18 +0900 Subject: [PATCH] Removed TODO and refreshed benchmark. --- Haskell/refSeqIdSymbol.html | 52 ++++++++++--------------- Haskell/refSeqIdSymbol.lhs | 76 +++++++++++++++++++------------------ 2 files changed, 58 insertions(+), 70 deletions(-) diff --git a/Haskell/refSeqIdSymbol.html b/Haskell/refSeqIdSymbol.html index d5a22772..9b1bf274 100644 --- a/Haskell/refSeqIdSymbol.html +++ b/Haskell/refSeqIdSymbol.html @@ -52,7 +52,6 @@ parseGbRecord r = case parse gbR print e Right r -> putStrLn r

Parsec returns either an error message or the result of the parsing.

-

TODO: how about printing always ? What is the difference between pring and putStrLn ?

gbRecord = do
   fs <- many field
   return . intercalate "\t" $ filter (/= "") fs
@@ -104,40 +103,27 @@ separator = newline >> not
  • cleans up the remaining double quotes.

  • Speed

    -

    Unfortunately, the Haskell version is way too slow to process the full RefSeq data. Here is a comparison using a test file of only 476 Kib.

    +

    The Haskell version is way too slow to process the full RefSeq data. Here is a comparison using a test file of only 100,000 lines (~4 megabytes).

    $ ghc -O2 refSeqIdSymbol.lhs
     [1 of 1] Compiling Main             ( refSeqIdSymbol.lhs, refSeqIdSymbol.o )
     Linking refSeqIdSymbol ...
    -chouca⁅~⁆$ time ./refSeqIdSymbol < hopla.gb 
    -NM_001142483.1  NREP
    -NM_001142481.1  NREP
    -NM_001142480.1  NREP
    -NM_001142477.1  NREP
    -NM_001142475.1  NREP
    -NM_001142474.1  NREP
    -NM_004772.2 NREP
    -NM_001142466.1  GPT2
    -NM_133443.2 GPT2
    -NM_173685.2 NSMCE2
    -NM_007058.3 CAPN11
     
    +curl --silent ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.4.rna.gbff.gz | gunzip | head -n 100000 > refSeqIdSymbol.testdata.gb
    +
    aqwa『Haskell』$ time cat refSeqIdSymbol.testdata.gb | grep -e VERSION -e gene= |   uniq |   sed 's/=/ /' |   awk '{print $2}' |   tr '\n' '\t' |   sed -e 's/"\t/\n/g' -e 's/"//g' | head
    +NM_025073.2     SIKE1
    +NM_181712.4     KANK4
    +NM_152372.3     MYOM3
    +NM_175066.3     DDX51
    +NM_024541.2     C10orf76
    +NM_015491.1     PNISR
    +NM_032870.2     PNISR
    +NM_182482.2     BAGE2
    +NM_005355.3     KIF25
    +NM_030615.2     KIF25
     
    -real    0m0.701s
    -user    0m0.664s
    -sys 0m0.028s
    -
    $ time cat hopla.gb | grep -e VERSION -e gene= |   uniq |   sed 's/=/ /' |   awk '{print $2}' |   tr '\n' '\t' |   sed -e 's/"\t/\n/g' -e 's/"//g'
    -NM_001142483.1  NREP
    -NM_001142481.1  NREP
    -NM_001142480.1  NREP
    -NM_001142477.1  NREP
    -NM_001142475.1  NREP
    -NM_001142474.1  NREP
    -NM_004772.2 NREP
    -NM_001142466.1  GPT2
    -NM_133443.2 GPT2
    -NM_173685.2 NSMCE2
    -NM_007058.3 CAPN11
    -
    -real    0m0.015s
    -user    0m0.004s
    -sys 0m0.004s
    +real    0m0.036s
    +user    0m0.028s
    +sys     0m0.012s
    +

    The Haskell parser is unfortunately more than 100 times slower (about 5 s versus 0.036 s of wall-clock time on the same test file).

    +

    +$ time cat refSeqIdSymbol.testdata.gb | ./refSeqIdSymbol | head
    +NM_025073.2     SIKE1
    +NM_181712.4     KANK4
    +NM_152372.3     MYOM3
    +NM_175066.3     DDX51
    +NM_024541.2     C10orf76
    +NM_015491.1     PNISR
    +NM_032870.2     PNISR
    +NM_182482.2     BAGE2
    +NM_005355.3     KIF25
    +NM_030615.2     KIF25
    +
    +real    0m4.963s
    +user    0m4.808s
    +sys     0m0.176s

    diff --git a/Haskell/refSeqIdSymbol.lhs b/Haskell/refSeqIdSymbol.lhs index 09d28585..8a96c424 100644 --- a/Haskell/refSeqIdSymbol.lhs +++ b/Haskell/refSeqIdSymbol.lhs @@ -115,8 +115,6 @@ them. Parsec returns either an error message or the result of the parsing. -TODO: how about printing always ? What is the difference between pring and putStrLn ? - > gbRecord = do > fs <- many field > return . intercalate "\t" $ filter (/= "") fs @@ -218,47 +216,51 @@ In brief, it: Speed ----- -Unfortunately, the Haskell version is way too slow to process the full RefSeq -data. Here is a comparison using a test file of only 476 Kib. +The Haskell version is way too slow to process the full RefSeq data. Here is a +comparison using a test file of only 100,000 lines (~4 megabytes). ~~~~~ $ ghc -O2 refSeqIdSymbol.lhs [1 of 1] Compiling Main ( refSeqIdSymbol.lhs, refSeqIdSymbol.o ) Linking refSeqIdSymbol ... -chouca⁅~⁆$ time ./refSeqIdSymbol < hopla.gb -NM_001142483.1 NREP -NM_001142481.1 NREP -NM_001142480.1 NREP -NM_001142477.1 NREP -NM_001142475.1 NREP -NM_001142474.1 NREP -NM_004772.2 NREP -NM_001142466.1 GPT2 -NM_133443.2 GPT2 -NM_173685.2 NSMCE2 -NM_007058.3 CAPN11 - - -real 0m0.701s -user 0m0.664s -sys 0m0.028s + +curl --silent ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.4.rna.gbff.gz | gunzip | head -n 100000 > refSeqIdSymbol.testdata.gb ~~~~~ ~~~~~ -$ time cat hopla.gb | grep -e VERSION -e gene= | uniq | sed 's/=/ /' | awk '{print $2}' | tr '\n' '\t' | sed -e 's/"\t/\n/g' -e 's/"//g' -NM_001142483.1 NREP -NM_001142481.1 NREP -NM_001142480.1 NREP -NM_001142477.1 NREP -NM_001142475.1 NREP -NM_001142474.1 NREP -NM_004772.2 NREP -NM_001142466.1 GPT2 -NM_133443.2 GPT2 -NM_173685.2 NSMCE2 -NM_007058.3 CAPN11 - -real 0m0.015s -user 0m0.004s -sys 0m0.004s +aqwa『Haskell』$ time cat refSeqIdSymbol.testdata.gb | grep -e VERSION -e gene= | uniq | sed 's/=/ /' | awk '{print $2}' | tr '\n' '\t' | sed -e 's/"\t/\n/g' -e 's/"//g' | head +NM_025073.2 SIKE1 +NM_181712.4 KANK4 
+NM_152372.3 MYOM3 +NM_175066.3 DDX51 +NM_024541.2 C10orf76 +NM_015491.1 PNISR +NM_032870.2 PNISR +NM_182482.2 BAGE2 +NM_005355.3 KIF25 +NM_030615.2 KIF25 + +real 0m0.036s +user 0m0.028s +sys 0m0.012s +~~~~~ + +The Haskell parser is unfortunately 100 times slower. + ~~~~~ +$ time cat refSeqIdSymbol.testdata.gb | ./refSeqIdSymbol | head +NM_025073.2 SIKE1 +NM_181712.4 KANK4 +NM_152372.3 MYOM3 +NM_175066.3 DDX51 +NM_024541.2 C10orf76 +NM_015491.1 PNISR +NM_032870.2 PNISR +NM_182482.2 BAGE2 +NM_005355.3 KIF25 +NM_030615.2 KIF25 + +real 0m4.963s +user 0m4.808s +sys 0m0.176s +~~~~ -- 2.47.3