コメントの除去

{-# LANGUAGE EmptyDataDecls #-}
module Text.Decomment (CommentStyle(..), QuoteStyle(..), decomment) where

import Control.Arrow
import Data.List
import Data.Maybe

class CommentStyle c where
  commentLeadings  :: c -> [String]
  commentOpenings  :: c -> [String]
  commentClosing   :: c -> String -> String
  commentNestable  :: c -> Bool

class QuoteStyle q where
  quoteOpenings  :: q -> [Char]
  quoteClosing   :: q -> Char -> Char
  quoteEscape    :: q -> Char

splitWithPrefix :: Eq a => [a] -> [a] -> Maybe ([a],[a])
splitWithPrefix [] xs = Just ([],xs)
splitWithPrefix (p:ps) [email protected](x:xs)
  | p == x     = splitWithPrefix ps xs >>= return . ((x:) *** id)
  | otherwise  = Nothing

splitQuoted :: Eq a => a -> a -> [a] -> ([a], [a])
splitQuoted esc qm []   = ([],[])
splitQuoted esc qm (c:cs)
           | c == qm    = ([],cs)    
           | c == esc   = case cs of 
                            []     -> ([c],[])
                            c':cs' -> ((c:).(c':) *** id) $ splitQuoted esc qm cs'
           | otherwise  = ((c:) *** id) $ splitQuoted esc qm cs

decomment :: (CommentStyle s, QuoteStyle s) => s -> String -> String
decomment s ""         = ""
decomment s [email protected](c:cs) = case mapMaybe (flip splitWithPrefix ccs) $ commentLeadings s of
  (_,xs):_ -> decomment s $ snd $ break ('\n'==) xs
  []       -> case mapMaybe (flip splitWithPrefix ccs) $ commentOpenings s of
                (p,xs):_ -> decommentC s [p] xs
                []       -> if elem c (quoteOpenings s) then c:decommentQ s c cs 
                            else c:decomment s cs

decommentC :: (CommentStyle s, QuoteStyle s) => s -> [String] -> String -> String
decommentC s [] xs = ' ':decomment s xs
decommentC s [email protected](p:ps) [email protected](x:xs)
  | commentNestable s  = case splitWithPrefix (commentClosing s p) xxs of
      Just (_,ys) -> decommentC s ps ys
      Nothing     -> case mapMaybe (flip splitWithPrefix xxs) $ commentOpenings s of
                       [] -> decommentC s pps xs
                       (p',yys):_ -> decommentC s (p':pps) yys
  | otherwise   = case mapMaybe (splitWithPrefix (commentClosing s p)) $ tails xs of
      (_,ys):_ -> ' ':decomment s ys
      []       -> ""

decommentQ :: (CommentStyle s, QuoteStyle s) => s -> Char -> String -> String
decommentQ s c xs = case splitQuoted (quoteEscape s) (quoteClosing s c) xs of
                      (ys,zs) -> ys ++ '"':decomment s zs

nobsun_decomment参照

コメント対応read

read' :: Read a => String -> a
read' = read . uncomment

uncomment :: String -> String
uncomment ""                 = ""
uncomment ('-':'-':rs)       = case break ('\n'==) rs of (ss,ts) -> uncomment ts
uncomment ('{':'-':rs)       = uncommentC 1 rs
uncomment ('\'':'"':'\'':rs) = '\'':'"':'\'':uncomment rs
uncomment ('\\':'"':rs)      = '\\':'"':uncomment rs
uncomment ('"':rs)           = '"':uncommentQ rs
uncomment (c:rs)             = c : uncomment rs

uncommentQ :: String -> String
uncommentQ ""             = ""
uncommentQ ('\\':'"':rs)  = '\\':'"':uncommentQ rs
uncommentQ ('"':rs)       = '"': uncomment rs
uncommentQ (c:rs)         = c:uncommentQ rs

uncommentC :: Int -> String -> String
uncommentC _ ""            = ""
uncommentC 0 rs            = uncomment rs
uncommentC k ('-':'}':rs)  = uncommentC (k-1) rs
uncommentC k ('{':'-':rs)  = uncommentC (k+1) rs
uncommentC k (c:rs)        = uncommentC k rs

sample0 = "[1 -- hoge\n,2\n,3\n{- ,4 \n -}\n]"

実行例

*Main> putStrLn sample0
[1 -- hoge
,2
,3
{- ,4 
 -}
]
*Main> read' sample0 :: [Int]
[1,2,3]

正規表現置換

import Text.Regex

sub re s str = case matchRegexAll re str of
                  Just (b, _, a, _) -> b ++ s ++ a
                  _ -> str

gsub re s str = case matchRegexAll re str of
                  Just (b, _, a, _) -> b ++ s ++ gsub re s a
                  _ -> str

words4ApacheLog

前に挙げたdividWithQuoteByをつかうとApacheのcombined 形式の access.log の項目も分解できる。単純な空白区切りの words に

testdata = "127.0.0.1 - - [12/Dec/2003:16:07:36 +0900] \"GET /cgi-bin/kahua.cgi HTTP/1.1\" 200 2301 \"-\" \"Mozilla/5.0 (X11; U; Linux i686; ja-JP; rv:1.5) Gecko/20031127 Firebird/0.7\""

を食わせると

Main> mapM_ putStrLn $ words testdata
127.0.0.1
-
-
[12/Dec/2003:16:07:36
+0900]
"GET
/cgi-bin/kahua.cgi
HTTP/1.1"
200
2301
"-"
"Mozilla/5.0
(X11;
U;
Linux
i686;
ja-JP;
rv:1.5)
Gecko/20031127
Firebird/0.7"

でも、

words4ApacheLog :: String -> [String]
words4ApacheLog = dividWithQuoteBy [('\"', '\"'),('[',']')] isSpace

としておいて、これをくわせると。

Prelude> mapM_ putStrLn $ words4ApacheLog testdata
127.0.0.1
-
-
[12/Dec/2003:16:07:36 +0900]
"GET /cgi-bin/kahua.cgi HTTP/1.1"
200
2301
"-"
"Mozilla/5.0 (X11; U; Linux i686; ja-JP; rv:1.5) Gecko/20031127 Firebird/0.7"

つまり、dividWithQuoteBy の逆を書けば汎用 encoding 関数が書けるんですね – shelarcy

文字列の先頭から等しい文字列を抜き出す

rwikiより。ナイーブな実装

same_prefix xs ys = map fst $ takeWhile (uncurry (==)) $ zip xs ys

ちょっとだけはやいかも

same_prefix (x:xs) (y:ys) | x == y    = x : same_prefix xs ys
                          | otherwise = []
same_prefix _ _ = []

お約束の末尾再帰版

same_prefix = same_prefix_iter []
 where
  same_prefix_iter xiferp (x:xs) (y:ys) | x == y    = same_prefix_iter (x:xiferp) xs ys
                                        | otherwise = reverse xiferp
  same_prefix_iter xiferp _  _ = reverse xiferp

文字列から一致する部分文字列を抽出する

import List

main :: IO ()
main = getContents >>= mapM_ print . sameString

sameString :: String -> [(String, Int)]
sameString = sortBy (\(_,x) (_,y) -> compare y x)
           . map (rv . last)
           . groupBy similar
           . sort . map (rv . last)
           . groupBy similar
           . getPrefix . sort . tails

rv :: (String, Int) -> (String, Int)
rv (s, i) = (reverse s, i)

similar :: (String, Int) -> (String, Int) -> Bool
similar (s1, i1) (s2, i2)
    | i1 /= i2 = False
    | otherwise = and $ map (uncurry (==)) $ zip s1 s2

getPrefix :: [String] -> [(String, Int)]
getPrefix ss = concat $ snd $ mapAccumL f ([], []) ss

f :: (String, [Int]) -> String -> ((String, [Int]), [(String, Int)])
f (cs, xs) ss = ((ss, p), q)
  where (p, q) = g cs xs ss

g :: String -> [Int] -> String -> ([Int], [(String, Int)])
g (c:cs) (x:xs) (s:ss) | c == s = ((x+1):p, map (\(s, n) -> (c:s, n)) q)
  where (p, q) = g cs xs ss
g cs xs ss = (map (\_ -> 1) ss, zip (drop 1 $ inits cs) (takeWhile (1<) xs))

CSV

一年以上まえの「算譜の記」より。

「データ行を与えられたセパレータ文字(CSVの場合は’,’)で区切られた値を表現する文字列に分解する」関数。ただし、引用内のセパレータ文字はセパレータとしない。引用を表現する文字対(引用開始文字、引用終了文字)は複数与えることができるものとする。
dividWithQuoteBy :: [(Char,Char)] -> (Char -> Bool) -> String -> [String]
dividWithQuoteBy _ _ "" = [""]
dividWithQuoteBy qs p [email protected](c:cs)
 = if p c
     then "" : dividWithQuoteBy qs p cs
     else case break (pOr p q) ccs of
            (_,[]) -> [ccs]
            (xs,[email protected](y:ys)) 
              -> case lookup y qs of
                   Nothing -> xs : dividWithQuoteBy qs p ys
                   Just eq 
                     -> case break (eq ==) ys of
                          (_,[])   -> error ("Invalid format: no "++show eq)
                          (zs,w:ws)-> case dividWithQuoteBy qs p ws of
                                        []     -> [ccs]
                                        ts:uus -> (xs++[y]++zs++[w]++ts) : uus
   where q x = case lookup x qs of {Nothing -> False; _-> True}
         pOr f g x = if f x then True else g x
これを利用すると CSVフォーマットのデータ行を分解するのは例えば、
readCSVLine :: String -> [String]
readCSVLine = dividWithQuoteBy [('\"','\"'),('\'','\'')] (','==)

自分自身のソースコードをプリントするプログラム

main=putStr(x++show x);x="main=putStr(x++show x);x="

文字列 C が文字列 A, B から構成されているか

MLHaskell-jp:485より

文字列 C の要素を二つに分けたとき、それらが C 上の順序を保って A, B と等しいか。

f xs ys = elem (Just ([], [])) . foldl g [Just (xs, ys)]
  where
    g xy z = [ case (l, r) of (Just (x:xs, ys), _) | x == z -> Just (xs, ys)
                              (_, Just (xs, y:ys)) | y == z -> Just (xs, ys)
                              _ -> Nothing
             | (l, r) <- zip (xy++[Nothing]) (Nothing:xy)
             ]

Main> f "aaaaaaaaaa" "aaaaaaaaaab" "aaaaaaaaaaaaaaaaaaaaa"
False
(1732 reductions, 5108 cells)
Main> f "chocolate" "chips" "cchochiolpaste"
True
(532 reductions, 1625 cells)

末広がりにならないように斜めにみる。 – yts

check xs ys zs = last $ next ys zs $ True:zipWith (==) xs zs
  where
    next [] zs bs = if length zs == length xs then bs else [False]
    next (y:ys) (z:zs) (b:bs)
      = next ys zs $ fix (\bls ->
            (b && z == y):[ (bl && z == x) || (bd && z == y) 
                | (bd, (bl, (x, z))) <- zip bs $ zip bls $ zip xs zs])

Exploit lazyness! fibなんかもこういうふうにbuild DOWNできます。 – yts

data Tree a = T a (Tree a) (Tree a)
check xs ys zs = b
  where end = T False end end
        [email protected](T b _ _) = checks xs ys zs t
        checks [] [] [] _ = T True end end
        checks [] [] _ _ = end
        checks _ _ [] _ = end
        checks (x:xs) [] (z:zs) ~(T _ [email protected](T a al ar) _)
            = T (z == x && a) (checks xs [] zs l) (T False ar end)
        checks [] (y:ys) (z:zs) ~(T _ _ [email protected](T b bl br))
            = T (z == y && b) (T False end bl) (checks [] ys zs r)
        checks [email protected](x:xs) [email protected](y:ys) (z:zs) ~(T _ [email protected](T a al ar) [email protected](T b bl br))
            = T (z == x && a || z == y && b)
                  (checks xs yys zs (T a al ar))
                  (checks xxs ys zs (T b ar br)) -- "ar"

Last modified : 2008/06/19 03:30:16 JST

Old/sampou.org/Programming_玉手箱_文字列

Programming_玉手箱_文字列