CheckLLM Competitor Benchmark

framework,dataset,metric_family,auc,best_f1,spearman,n,mean_latency_ms,total_cost_usd,rank
checkllm,halubench,hallucination,0.7834641706924316,0.7962085308056872,0.5439989768236825,200.0,2415.385,0.03429645,1
deepeval,halubench,hallucination,0.553341384863124,0.7012987012987013,0.15094728675256455,200.0,4456.805,0.0,3
promptfoo,halubench,hallucination,0.7528683574879227,0.7913043478260869,0.5103141714087509,200.0,1801.825,0.029248049999999998,2
deepeval,ragtruth,context_relevance,0.4348598499802606,0.8538681948424068,-0.09953764028954455,200.0,20571.63,0.0,3
promptfoo,ragtruth,context_relevance,0.5,0.8538681948424068,nan,200.0,1363.54,0.0422958,2
checkllm,ragtruth,context_relevance,0.5645479668377419,0.8563218390804598,0.12519910668622805,200.0,2350.945,0.06226005,1
checkllm,ragtruth,faithfulness,0.7541781813396499,0.8606811145510835,0.4235879167613457,200.0,11877.5,0.06127485,1
deepeval,ragtruth,faithfulness,0.6308724832214765,0.8538681948424068,0.2046464551960725,200.0,17191.23,0.0,2
promptfoo,ragtruth,faithfulness,0.5340834320305304,0.8563218390804598,0.08964416068501872,200.0,1692.765,0.0440592,3
checkllm,ragtruth,hallucination,0.663179365706014,0.8714733542319749,0.39820181540014465,200.0,2728.255,0.044208449999999996,1
deepeval,ragtruth,hallucination,0.5879721015923148,0.8690476190476191,0.31107184538246846,200.0,3669.495,0.0,2
promptfoo,ragtruth,hallucination,0.5130280300039479,0.8554913294797688,0.08111708188068653,200.0,1602.015,0.0441192,3
checkllm,truthfulqa,answer_relevancy,0.54555,0.6666666666666666,0.08511435692920465,400.0,6643.0575,0.021320099999999998,1
deepeval,truthfulqa,answer_relevancy,0.4375625,0.6666666666666666,-0.12200569080495786,400.0,30595.83,0.0,2
promptfoo,truthfulqa,answer_relevancy,0.392225,0.6666666666666666,-0.23251317852311312,400.0,1175.63,0.024685199999999997,3