Inside F#

Brian's thoughts on F# and .NET

Monadic parser combinators, part four – source code

Posted by Brian on December 6, 2007

As I said at the end of the previous entry, the server wouldn’t accept a post that large, so I had to post the source code separately from the text.  So here is the code for part four.

using System;
using System.Linq;
using System.Collections.Generic;
using System.Text;
using Position = System.Int32;

namespace MPC_Part4
{
    public class ParserState
    {
        public readonly Position Position;
        public readonly string Input;
        public ParserState(Position position, string input) { this.Position = position; this.Input = input; }
        public override string ToString()
        {
            // place a '^' before the current location in the string; this is helpful when looking at
            // values inside the debugger
            return this.Input.Substring(0, this.Position) + "^" + this.Input.Substring(this.Position);
        }
    }

    public class ErrorInfo
    {
        public readonly Position Position;
        public readonly IEnumerable<string> Expectations;
        public readonly string Message;
        public ErrorInfo(Position position) : this(position, Enumerable.Empty<string>(), "unknown error") { }
        public ErrorInfo(Position position, IEnumerable<string> expectations, string message) { this.Position = position; this.Expectations = expectations; this.Message = message; }
        public ErrorInfo Merge(ErrorInfo other)
        {
            // Note that this.Position == other.Position holds true, except when one is a failed Try that looked ahead.
            // Since the most common idiom is Try(p1).Or(p2), we favor keeping position2 and message2, as p2 is 
            // likely to consume and thus be the only contributor to the final expectations.
            // But if Try(p1).Or(Try(p2)) fails, and both alternatives looked ahead different number of tokens, then
            // a single error message is bound to be confusing, reporting some wrong position or details.
            // This is an intrinsic difficulty when it comes to reporting errors for languages that do not admit 
            // predictive parsers, and it illustrates why Try() and backtracking should be used judiciously.
            return new ErrorInfo(other.Position, this.Expectations.Concat(other.Expectations), other.Message);
        }
        public ErrorInfo SetExpectation(string label)
        {
            if (string.IsNullOrEmpty(label))
                return new ErrorInfo(this.Position, Enumerable.Empty<string>(), this.Message);
            else
                return new ErrorInfo(this.Position, Parser.Cons(label, Enumerable.Empty<string>()), this.Message);
        }
        public override string ToString()
        {
            List<string> expectations = this.Expectations.ToList();
            StringBuilder result = new StringBuilder(string.Format("At position {0}, {1}", this.Position, this.Message));
            if (expectations.Count != 0)
            {
                result.Append(", ");
                if (expectations.Count == 1)
                {
                    result.Append("expected " + expectations[0]);
                }
                else
                {
                    result.Append("expected ");
                    for (int i = 0; i < expectations.Count - 2; ++i)
                    {
                        result.Append(expectations[i]);
                        result.Append(", ");
                    }
                    result.Append(expectations[expectations.Count - 2]);
                    result.Append(" or ");
                    result.Append(expectations[expectations.Count - 1]);
                }
            }
            return result.ToString();
        }
    }

    public class ParseResult<T>
    {
        public readonly bool Succeeded;
        public readonly ErrorInfo ErrorInfo;
        readonly T result;
        readonly ParserState remainingInput;
        public T Result
        {
            get { if (!this.Succeeded) throw new InvalidOperationException(); return this.result; }
        }
        public ParserState RemainingInput
        {
            get { if (!this.Succeeded) throw new InvalidOperationException(); return this.remainingInput; }
        }
        public ParseResult(T result, ParserState remainingInput, ErrorInfo errorInfo) 
        { 
            this.result = result;
            this.remainingInput = remainingInput;
            this.ErrorInfo = errorInfo;
            this.Succeeded = true;
        }
        public ParseResult(ErrorInfo errorInfo) { this.ErrorInfo = errorInfo; }
        public ParseResult<T> MergeError(ErrorInfo otherError)
        {
            if (this.Succeeded)
                return new ParseResult<T>(this.Result, this.RemainingInput, this.ErrorInfo.Merge(otherError));
            else
                return new ParseResult<T>(this.ErrorInfo.Merge(otherError));
        }
        public ParseResult<T> SetExpectation(string label)
        {
            if (this.Succeeded)
                return new ParseResult<T>(this.Result, this.RemainingInput, this.ErrorInfo.SetExpectation(label));
            else
                return new ParseResult<T>(this.ErrorInfo.SetExpectation(label));
        }
        public override bool Equals(object obj)
        {
            ParseResult<T> other = obj as ParseResult<T>;
            if (other == null)
                return false;
            if (!object.Equals(other.Result, this.Result))
                return false;
            if (!object.Equals(other.RemainingInput, this.RemainingInput))
                return false;
            if (!object.Equals(other.ErrorInfo, this.ErrorInfo))
                return false;
            return true;
        }
        public override int GetHashCode()
        {
            return this.Succeeded.GetHashCode();
        }
        public override string ToString()
        {
            if (this.Succeeded)
                return string.Format("Success: {0}", this.Result.ToString());
            else
                return string.Format("Failure: {0}", this.ErrorInfo.ToString());
        }
    }

    public class Consumed<T>
    {
        public readonly bool HasConsumedInput;
        public readonly ParseResult<T> ParseResult;
        public Consumed(bool hasConsumedInput, ParseResult<T> result) { this.HasConsumedInput = hasConsumedInput; this.ParseResult = result; }
        public Consumed<T> Tag(string label)
        {
            if (this.HasConsumedInput)
                return this;
            return new Consumed<T>(this.HasConsumedInput, this.ParseResult.SetExpectation(label));
        }
        public override string ToString()
        {
            return this.ParseResult.ToString();
        }
    }

    // representation type for parsers
    public delegate Consumed<T> P<T>(ParserState input);

    public static class Parser
    {
        // core functions that know the underlying parser representation
        // clients call p.Parse("someString") so as not to deal with implementation details like Consumed & ParseState
        public static ParseResult<T> Parse<T>(this P<T> p, string toParse)
        {
            return p(new ParserState(0, toParse)).ParseResult;
        }
        // Fail() is a parser that always fails with "message", without consuming any input
        public static P<T> Fail<T>(string message)
        {
            return input => new Consumed<T>(false, new ParseResult<T>(new ErrorInfo(input.Position, Enumerable.Empty<string>(), message)));
        }
        // Return(x) is a parser that always succeeds with result x, without consuming any input
        public static P<T> Return<T>(T x)
        {
            return input => new Consumed<T>(false, new ParseResult<T>(x, input, new ErrorInfo(input.Position)));
        }
        public static P<Func<T, T, T>> Return<T>(Func<T, T, T> f)
        {
            return Return<Func<T, T, T>>(f);
        }
        // p1.Then_(p2) runs p1, discards the result, then runs p2 on the remaining input
        public static P<U> Then_<T, U>(this P<T> p1, P<U> p2)
        {
            return p1.Then(dummy => p2);
        }
        // p1.Then(v => p2) runs p1, binds the result to variable v, then runs p2 on the remaining input
        public static P<U> Then<T, U>(this P<T> p1, Func<T, P<U>> f)
        {
            return input =>
            {
                Consumed<T> consumed1 = p1(input);
                if (consumed1.ParseResult.Succeeded)
                {
                    Consumed<U> consumed2 = f(consumed1.ParseResult.Result)(consumed1.ParseResult.RemainingInput);
                    return new Consumed<U>(consumed1.HasConsumedInput || consumed2.HasConsumedInput,
                                           consumed2.HasConsumedInput
                                               ? consumed2.ParseResult 
                                               : consumed2.ParseResult.MergeError(consumed1.ParseResult.ErrorInfo));
                }
                else
                {
                    return new Consumed<U>(consumed1.HasConsumedInput, new ParseResult<U>(consumed1.ParseResult.ErrorInfo));
                }
            };
        }
        // p1.Or(p2) tries p1, but if it fails without consuming input, runs p2 instead
        public static P<T> Or<T>(this P<T> p1, P<T> p2)
        {
            return input =>
            {
                Consumed<T> consumed1 = p1(input);
                if (consumed1.ParseResult.Succeeded || consumed1.HasConsumedInput)
                {
                    return consumed1;
                }
                else
                {
                    Consumed<T> consumed2 = p2(input);
                    if (consumed2.HasConsumedInput)
                        return consumed2;
                    return new Consumed<T>(consumed2.HasConsumedInput,
                        consumed2.ParseResult.MergeError(consumed1.ParseResult.ErrorInfo));
                }
            };
        }
        // Try(p) behaves like p, except if p fails, it pretends it hasn't consumed any input
        public static P<T> Try<T>(this P<T> p)
        {
            return input =>
            {
                Consumed<T> consumed = p(input);
                if (consumed.HasConsumedInput && !consumed.ParseResult.Succeeded)
                    return new Consumed<T>(false, consumed.ParseResult);
                else
                    return consumed;
            };
        }
        // p.Tag(label) makes it so if p fails without consuming input, the error "expected label" occurs
        public static P<T> Tag<T>(this P<T> p, string label)
        {
            return input => p(input).Tag(label);
        }
        // Sat(pred) succeeds parsing a character only if the character matches the predicate
        public static P<char> Sat(Predicate<char> pred)
        {
            return input =>
            {
                if (input.Position >= input.Input.Length)
                    return new Consumed<char>(false, new ParseResult<char>(new ErrorInfo(input.Position, Enumerable.Empty<string>(), "unexpected end of input")));
                else if (!pred(input.Input[input.Position]))
                    return new Consumed<char>(false, new ParseResult<char>(new ErrorInfo(input.Position, Enumerable.Empty<string>(), "unexpected character '" + input.Input[input.Position] + "'")));
                else
                    return new Consumed<char>(true, new ParseResult<char>(input.Input[input.Position], new ParserState(input.Position+1, input.Input), new ErrorInfo(input.Position+1)));
            };
        }
        // other handy parser functions
        // Item() consumes the first character of the input string and returns it as a result
        public static P<char> Item()
        {
            return Sat(c => true).Tag("any character");
        }
        // Letter parses a letter
        public static P<char> Letter = Sat(char.IsLetter).Tag("letter");
        // Digit parses a digit character
        public static P<char> Digit = Sat(char.IsDigit).Tag("digit");
        // DigitVal parses a digit character and returns the corresponding numeric value
        public static P<int> DigitVal = Sat(char.IsDigit).Then(c => Return((int)(c - '0')));
        // Literal(c) parses a particular character
        public static P<char> Literal(char c) { return Sat(x => x == c).Tag("character '"+c+"'"); }
        // Literal(s, x) parses a particular string, and returns a fixed result
        public static P<T> Literal<T>(string toParse, T result)
        {
            return LiteralHelper(toParse, result).Tag("literal string \"" + toParse + "\"");
        }
        static P<T> LiteralHelper<T>(string toParse, T result)
        {
            if (toParse == "")
                return Parser.Return(result);
            else
                return Literal(toParse[0]).Then_(LiteralHelper(toParse.Substring(1), result));
        }
        // Cons(x, rest) returns a new list like rest but with x appended to the front
        public static IEnumerable<T> Cons<T>(T x, IEnumerable<T> rest)
        {
            yield return x;
            foreach (T t in rest)
                yield return t;
        }
        // Many(p) parses with p zero or more times, returning the sequence of results
        public static P<IEnumerable<T>> Many<T>(P<T> p)
        {
            return Many1<T>(p).Or(Return(Enumerable.Empty<T>()));
        }
        // Many1(p) parses with p one or more times, returning the sequence of results
        public static P<IEnumerable<T>> Many1<T>(P<T> p)
        {
            return p.Then(x =>
                   Many(p).Then(xs =>
                   Return(Cons(x, xs))));
        }
        // one way to parse natural numbers
        public static P<int> Natural = DigitVal.Then(x => NaturalHelper(x)).Tag("natural number");
        static P<int> NaturalHelper(int x)
        {
            return DigitVal.Then(y =>
                   NaturalHelper(10 * x + y))
            .Or(Return(x));
        }
        // p.Chainl1(op) parses a grammar like "p (op p)*"
        // the result is the p results combined by the op results, left-associatively
        public static P<T> Chainl1<T>(this P<T> p, P<Func<T, T, T>> op)
        {
            return p.Then(x =>
                Chainl1Helper(x, p, op));
        }
        static P<T> Chainl1Helper<T>(T x, P<T> p, P<Func<T, T, T>> op)
        {
            return op.Then(f =>
                   p.Then(y =>
                   Chainl1Helper<T>(f(x, y), p, op)))
            .Or(Return(x));
        }
        // another way to parse naturals
        public static P<int> Nat = DigitVal.Chainl1(Return((int x, int y) => 10 * x + y)).Tag("natural number");
        // p.Chainr1(op) is just like p.Chainl1(op), except it's right-associative
        public static P<T> Chainr1<T>(this P<T> p, P<Func<T, T, T>> op)
        {
            return p.Then(x =>
                   (
                   op.Then(f =>
                   Chainr1(p, op).Then(y =>
                   Return(f(x, y))))
                   )
                   .Or(Return(x)));
        }
        // p.SepBy(sep) parses zero or more p, each separated by a sep, returning a list of the p results
        public static P<IEnumerable<T>> SepBy<T, U>(this P<T> p, P<U> sep)
        {
            return SepBy1(p, sep).Or(Return(Enumerable.Empty<T>()));
        }
        // p.SepBy(sep) parses one or more p, each separated by a sep, returning a list of the p results
        public static P<IEnumerable<T>> SepBy1<T, U>(this P<T> p, P<U> sep)
        {
            return p.Then(x =>
                SepByHelper(p, sep).Then(xs =>
                Return(Cons(x, xs))));
        }
        static P<IEnumerable<T>> SepByHelper<T, U>(P<T> p, P<U> sep)
        {
            return sep.Then_(SepBy1(p, sep))
                .Or(Return(Enumerable.Empty<T>()));
        }
        // p1.NotFollowedBy(p2,label) runs p1 and then looks ahead for p2, failing if p2 is there
        public static P<T> NotFollowedBy<T, U>(this P<T> p1, P<U> p2, string p2Label)
        {
            return p1.Then(result =>
                Try(p2.Then_(Fail<T>("unexpected " + p2Label))
                    .Or(Return(result))));
        }
        public static P<int> EndOfInput = Return(0).NotFollowedBy(Item(), "character").Tag("end of input");

        public static P<T> FullWordLiteral<T>(string s, T result)
        {
            return Try(Literal(s, result).NotFollowedBy(Letter, "letter"));
        }

        public static P<int> OneToNine =
            FullWordLiteral("one", 1).Or(
            FullWordLiteral("two", 2).Or(
            FullWordLiteral("three", 3).Or(
            FullWordLiteral("four", 4).Or(
            FullWordLiteral("five", 5).Or(
            FullWordLiteral("six", 6).Or(
            FullWordLiteral("seven", 7).Or(
            FullWordLiteral("eight", 8).Or(
            FullWordLiteral("nine", 9))))))))).Tag("english number 1-9");

        public static P<int> TenToNineteen =
            FullWordLiteral("ten", 10).Or(
            FullWordLiteral("eleven", 11).Or(
            FullWordLiteral("twelve", 12).Or(
            FullWordLiteral("thirteen", 13).Or(
            FullWordLiteral("fourteen", 14).Or(
            FullWordLiteral("fifteen", 15).Or(
            FullWordLiteral("sixteen", 16).Or(
            FullWordLiteral("seventeen", 17).Or(
            FullWordLiteral("eighteen", 18).Or(
            FullWordLiteral("nineteen", 19)))))))))).Tag("english number 10-19");

        public static P<int> HigherTens(string tenName, int tenValue)
        {
            return FullWordLiteral(tenName, tenValue).Then(tens =>
                   (
                       Sat(c => c == '-').Then_(
                       OneToNine.Then(ones =>
                       Parser.Return(tens + ones)))
                   ).Or(Parser.Return(tens)));
        }

        public static P<int> OneToNinetyNine =
            OneToNine
            .Or(TenToNineteen)
            .Or(HigherTens("twenty", 20))
            .Or(HigherTens("thirty", 30))
            .Or(HigherTens("forty", 40))
            .Or(HigherTens("fifty", 50))
            .Or(HigherTens("sixty", 60))
            .Or(HigherTens("seventy", 70))
            .Or(HigherTens("eighty", 80))
            .Or(HigherTens("ninety", 90)).Tag("english number 1-99");

        // various running examples from the blog
        static void AssertEqual<T>(T x, T y)
        {
            if (!object.Equals(x, y))
                throw new Exception("not equal");
        }
        static void AssertEqualEnumerable<T>(IEnumerable<T> x, IEnumerable<T> y)
        {
            List<T> xl = x.ToList();
            List<T> yl = y.ToList();
            AssertEqual(xl.Count, yl.Count);
            for (int i = 0; i < xl.Count; ++i)
                AssertEqual(xl[i], yl[i]);
        }

        static void Main(string[] args)
        {
            Console.WriteLine(Sat(c => c == 'x').Parse(""));
            Console.WriteLine(Sat(c => c == 'x').Parse("y"));
            Console.WriteLine(Sat(c => c == 'x').Parse("x"));
            Console.WriteLine();

            Console.WriteLine(Letter.Or(Digit).Parse("?"));
            P<string> word = Many1(Letter).Then(chars => Return(new string(chars.ToArray())));
            AssertEqual(word.Parse("hello world").Result, "hello");

            AssertEqual(Natural.Parse("256K").Result, 256);
            AssertEqual(Nat.Parse("256K").Result, 256);

            P<Func<int, int, int>> addOp = Literal('+').Then_(Return((int x, int y) => x + y))
                                     .Or(Literal('-').Then_(Return((int x, int y) => x - y)))
                                     .Tag("add/subtract op");
            P<int> simpleExpr = Natural.Chainl1(addOp);
            AssertEqual(simpleExpr.Parse("10+2+3-14").Result, 1);

            P<Func<int, int, int>> mulOp = Literal('*').Then_(Return((int x, int y) => x * y))
                                     .Or(Literal('/').Then_(Return((int x, int y) => x / y))).Tag("multiply/divide op");
            P<Func<int, int, int>> expOp = Literal('^').Then_(Return((int x, int y) => (int)Math.Pow(x, y))).Tag("exponentiation op");
            P<int> expr = null, term = null, factor = null, part = null, paren = null;
            paren = Literal('(').Then(dummy => // use dummy lambda to capture reference to mutable "expr"
                    expr.Then(e =>
                    Literal(')').Then_(
                    Return<int>(e))));
            part = Nat.Or(paren);
            factor = part.Chainr1(expOp);
            term = factor.Chainl1(mulOp);
            expr = term.Chainl1(addOp);
            AssertEqual(expr.Parse("(611-2^3^2+1)/10-5*2").Result, 0);
            Console.WriteLine(expr.Parse("(611-2^-3^2+1)/10-5*2"));
            Console.WriteLine(expr.Parse("(611-2^3^2.2+1)/10-5*2"));
            Console.WriteLine();

            P<IEnumerable<int>> listOfNums = Literal('[').Then_(
                                             Nat.SepBy(Literal(',')).Then(nats =>
                                             Literal(']').Then_(
                                             Return(nats))));
            AssertEqualEnumerable(listOfNums.Parse("[1,10,100]").Result, new int[] { 1, 10, 100 });
            AssertEqualEnumerable(listOfNums.Parse("[1]").Result, new int[] { 1 });
            AssertEqualEnumerable(listOfNums.Parse("[]").Result, new int[] { });
            Console.WriteLine(listOfNums.Parse("[1,2x"));
            Console.WriteLine();

            Console.WriteLine(Literal('1').NotFollowedBy(Literal('2'), "character '2'").Parse("12"));
            Console.WriteLine(Literal('1').Then_(EndOfInput).Parse("1"));
            Console.WriteLine(Literal('1').Then_(EndOfInput).Parse("12"));
            Console.WriteLine();

            Console.WriteLine(OneToNinetyNine.Parse("seven"));
            Console.WriteLine(OneToNinetyNine.Parse("twenty-seven"));
            Console.WriteLine(OneToNinetyNine.Parse("seventy-seven"));
            Console.WriteLine(OneToNinetyNine.Parse("so"));
            Console.WriteLine();

            Console.WriteLine("done, press a key");
            Console.ReadKey();
        }
    }
}
Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s

 
%d bloggers like this: