public final class FST<T> extends java.lang.Object implements Accountable
The format is similar to what's used by Morfologik (https://github.com/morfologik/morfologik-stemming).
See the package
documentation
for some simple examples.
Modifier and Type | Class and Description |
---|---|
static class |
FST.Arc<T>
Represents a single arc.
|
static class |
FST.BytesReader
Reads bytes stored in an FST.
|
static class |
FST.INPUT_TYPE
Specifies allowed range of each int input label for
this FST.
|
Modifier and Type | Field and Description |
---|---|
private static long |
ARC_SHALLOW_RAM_BYTES_USED |
static byte |
ARCS_FOR_BINARY_SEARCH
Value of the arc flags to declare a node with fixed length arcs
designed for binary search.
|
(package private) static byte |
ARCS_FOR_DIRECT_ADDRESSING
Value of the arc flags to declare a node with fixed length arcs
and bit table designed for direct addressing.
|
private static long |
BASE_RAM_BYTES_USED |
private static int |
BIT_ARC_HAS_FINAL_OUTPUT |
static int |
BIT_ARC_HAS_OUTPUT
This flag is set if the arc has an output.
|
private static int |
BIT_FINAL_ARC |
(package private) static int |
BIT_LAST_ARC |
private static int |
BIT_STOP_NODE |
(package private) static int |
BIT_TARGET_NEXT |
(package private) BytesStore |
bytes
A
BytesStore , used during building, or during reading when
the FST is very large (more than 1 GB). |
private static int |
DEFAULT_MAX_BLOCK_BITS |
private static float |
DIRECT_ADDRESSING_MAX_OVERSIZE_WITH_CREDIT_FACTOR
Maximum oversizing factor allowed for direct addressing compared to binary search when expansion
credits allow the oversizing.
|
(package private) T |
emptyOutput |
static int |
END_LABEL
If arc has this label then that arc is final/accepted
|
private static java.lang.String |
FILE_FORMAT_NAME |
private static long |
FINAL_END_NODE |
(package private) static int |
FIXED_LENGTH_ARC_DEEP_NUM_ARCS |
(package private) static int |
FIXED_LENGTH_ARC_SHALLOW_DEPTH |
(package private) static int |
FIXED_LENGTH_ARC_SHALLOW_NUM_ARCS |
private FSTStore |
fstStore |
(package private) FST.INPUT_TYPE |
inputType |
private static long |
NON_FINAL_END_NODE |
Outputs<T> |
outputs |
private long |
startNode |
private static int |
VERSION_CURRENT |
private static int |
VERSION_START |
Constructor and Description |
---|
FST(DataInput in,
Outputs<T> outputs)
Load a previously saved FST.
|
FST(DataInput in,
Outputs<T> outputs,
FSTStore fstStore)
Load a previously saved FST; maxBlockBits allows you to
control the size of the byte[] pages used to hold the FST bytes.
|
FST(FST.INPUT_TYPE inputType,
Outputs<T> outputs,
int bytesPageBits) |
Modifier and Type | Method and Description |
---|---|
(package private) long |
addNode(Builder<T> builder,
Builder.UnCompiledNode<T> nodeIn) |
private boolean |
assertPresenceBytesAreValid(FST.Arc<T> arc) |
FST.Arc<T> |
findTargetArc(int labelToMatch,
FST.Arc<T> follow,
FST.Arc<T> arc,
FST.BytesReader in)
Finds an arc leaving the incoming arc, replacing the arc in place.
|
(package private) void |
finish(long newStartNode) |
private static boolean |
flag(int flags,
int bit) |
FST.BytesReader |
getBytesReader()
Returns a
FST.BytesReader for this FST, positioned at
position 0. |
T |
getEmptyOutput() |
FST.Arc<T> |
getFirstArc(FST.Arc<T> arc)
Fills virtual 'start' arc, ie, an empty incoming arc to the FST's start node
|
private int |
getNumArcsDirectAddressing(FST.Arc<T> arc) |
private static int |
getNumPresenceBytes(int labelRange)
Gets the number of bytes required to flag the presence of each arc in the given label range, one bit per arc.
|
(package private) boolean |
isExpandedTarget(FST.Arc<T> follow,
FST.BytesReader in)
Returns whether
arc 's target points to a node in expanded format (fixed length arcs). |
long |
ramBytesUsed()
Return the memory usage of this object in bytes.
|
static <T> FST<T> |
read(java.nio.file.Path path,
Outputs<T> outputs)
Reads an automaton from a file.
|
private FST.Arc<T> |
readArc(FST.Arc<T> arc,
FST.BytesReader in)
Reads an arc.
|
FST.Arc<T> |
readArcByDirectAddressing(FST.Arc<T> arc,
FST.BytesReader in,
int rangeIndex)
Reads a present direct addressing node arc, with the provided index in the label range.
|
FST.Arc<T> |
readArcByIndex(FST.Arc<T> arc,
FST.BytesReader in,
int idx) |
(package private) static <T> FST.Arc<T> |
readEndArc(FST.Arc<T> follow,
FST.Arc<T> arc) |
FST.Arc<T> |
readFirstRealTargetArc(long nodeAddress,
FST.Arc<T> arc,
FST.BytesReader in) |
FST.Arc<T> |
readFirstTargetArc(FST.Arc<T> follow,
FST.Arc<T> arc,
FST.BytesReader in)
Follow the
follow arc and read the first arc of its target;
this changes the provided arc (2nd arg) in-place and returns
it. |
int |
readLabel(DataInput in)
Reads one BYTE1/2/4 label from the provided
DataInput . |
(package private) FST.Arc<T> |
readLastTargetArc(FST.Arc<T> follow,
FST.Arc<T> arc,
FST.BytesReader in)
Follows the
follow arc and reads the last
arc of its target; this changes the provided
arc (2nd arg) in-place and returns it. |
FST.Arc<T> |
readNextArc(FST.Arc<T> arc,
FST.BytesReader in)
In-place read; returns the arc.
|
(package private) int |
readNextArcLabel(FST.Arc<T> arc,
FST.BytesReader in)
Peeks at next arc's label; does not alter arc.
|
FST.Arc<T> |
readNextRealArc(FST.Arc<T> arc,
FST.BytesReader in)
Never returns null, but you should never call this if
arc.isLast() is true.
|
private int |
readPresenceBytes(FST.Arc<T> arc,
FST.BytesReader in)
Reads the presence bits of a direct-addressing node, store them in the provided arc
FST.Arc.bitTable()
and returns the number of presence bytes. |
private long |
readUnpackedNodeTarget(FST.BytesReader in) |
void |
save(DataOutput out) |
void |
save(java.nio.file.Path path)
Writes an automaton to a file.
|
private void |
seekToNextNode(FST.BytesReader in) |
(package private) void |
setEmptyOutput(T v) |
private boolean |
shouldExpandNodeWithDirectAddressing(Builder<T> builder,
Builder.UnCompiledNode<T> nodeIn,
int numBytesPerArc,
int maxBytesPerArcWithoutLabel,
int labelRange)
Returns whether the given node should be expanded with direct addressing instead of binary search.
|
private boolean |
shouldExpandNodeWithFixedLengthArcs(Builder<T> builder,
Builder.UnCompiledNode<T> node)
Returns whether the given node should be expanded with fixed length arcs.
|
static <T> boolean |
targetHasArcs(FST.Arc<T> arc)
returns true if the node at this address has any
outgoing arcs
|
java.lang.String |
toString() |
private void |
writeLabel(DataOutput out,
int v) |
private void |
writeNodeForBinarySearch(Builder<T> builder,
Builder.UnCompiledNode<T> nodeIn,
long startAddress,
int maxBytesPerArc) |
private void |
writeNodeForDirectAddressing(Builder<T> builder,
Builder.UnCompiledNode<T> nodeIn,
long startAddress,
int maxBytesPerArcWithoutLabel,
int labelRange) |
private void |
writePresenceBits(Builder<T> builder,
Builder.UnCompiledNode<T> nodeIn,
long dest,
int numPresenceBytes) |
clone, equals, finalize, getClass, hashCode, notify, notifyAll, wait, wait, wait
getChildResources
private static final long BASE_RAM_BYTES_USED
private static final long ARC_SHALLOW_RAM_BYTES_USED
private static final int BIT_FINAL_ARC
static final int BIT_LAST_ARC
static final int BIT_TARGET_NEXT
private static final int BIT_STOP_NODE
public static final int BIT_ARC_HAS_OUTPUT
private static final int BIT_ARC_HAS_FINAL_OUTPUT
public static final byte ARCS_FOR_BINARY_SEARCH
static final byte ARCS_FOR_DIRECT_ADDRESSING
static final int FIXED_LENGTH_ARC_SHALLOW_DEPTH
static final int FIXED_LENGTH_ARC_SHALLOW_NUM_ARCS
static final int FIXED_LENGTH_ARC_DEEP_NUM_ARCS
private static final float DIRECT_ADDRESSING_MAX_OVERSIZE_WITH_CREDIT_FACTOR
private static final java.lang.String FILE_FORMAT_NAME
private static final int VERSION_START
private static final int VERSION_CURRENT
private static final long FINAL_END_NODE
private static final long NON_FINAL_END_NODE
public static final int END_LABEL
final FST.INPUT_TYPE inputType
T emptyOutput
final BytesStore bytes
BytesStore
, used during building, or during reading when
the FST is very large (more than 1 GB). If the FST is less than 1
GB then bytesArray is set instead.private final FSTStore fstStore
private long startNode
private static final int DEFAULT_MAX_BLOCK_BITS
FST(FST.INPUT_TYPE inputType, Outputs<T> outputs, int bytesPageBits)
public FST(DataInput in, Outputs<T> outputs) throws java.io.IOException
java.io.IOException
private static boolean flag(int flags, int bit)
public long ramBytesUsed()
Accountable
ramBytesUsed
in interface Accountable
public java.lang.String toString()
toString
in class java.lang.Object
void finish(long newStartNode) throws java.io.IOException
java.io.IOException
public T getEmptyOutput()
void setEmptyOutput(T v)
public void save(DataOutput out) throws java.io.IOException
java.io.IOException
public void save(java.nio.file.Path path) throws java.io.IOException
java.io.IOException
public static <T> FST<T> read(java.nio.file.Path path, Outputs<T> outputs) throws java.io.IOException
java.io.IOException
private void writeLabel(DataOutput out, int v) throws java.io.IOException
java.io.IOException
public int readLabel(DataInput in) throws java.io.IOException
DataInput
.java.io.IOException
public static <T> boolean targetHasArcs(FST.Arc<T> arc)
long addNode(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn) throws java.io.IOException
java.io.IOException
private boolean shouldExpandNodeWithFixedLengthArcs(Builder<T> builder, Builder.UnCompiledNode<T> node)
Nodes with fixed length arcs use more space, because they encode all arcs with a fixed number of bytes, but they allow either binary search or direct addressing on the arcs (instead of linear scan) on lookup by arc label.
private boolean shouldExpandNodeWithDirectAddressing(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn, int numBytesPerArc, int maxBytesPerArcWithoutLabel, int labelRange)
Prefer direct addressing for performance if it does not oversize binary search byte size too much, so that the arcs can be directly addressed by label.
private void writeNodeForBinarySearch(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn, long startAddress, int maxBytesPerArc)
private void writeNodeForDirectAddressing(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn, long startAddress, int maxBytesPerArcWithoutLabel, int labelRange)
private void writePresenceBits(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn, long dest, int numPresenceBytes)
private static int getNumPresenceBytes(int labelRange)
private int readPresenceBytes(FST.Arc<T> arc, FST.BytesReader in) throws java.io.IOException
FST.Arc.bitTable()
and returns the number of presence bytes.java.io.IOException
public FST.Arc<T> getFirstArc(FST.Arc<T> arc)
FST.Arc<T> readLastTargetArc(FST.Arc<T> follow, FST.Arc<T> arc, FST.BytesReader in) throws java.io.IOException
follow
arc and reads the last
arc of its target; this changes the provided
arc
(2nd arg) in-place and returns it.arc
).java.io.IOException
private long readUnpackedNodeTarget(FST.BytesReader in) throws java.io.IOException
java.io.IOException
public FST.Arc<T> readFirstTargetArc(FST.Arc<T> follow, FST.Arc<T> arc, FST.BytesReader in) throws java.io.IOException
follow
arc and read the first arc of its target;
this changes the provided arc
(2nd arg) in-place and returns
it.arc
).java.io.IOException
public FST.Arc<T> readFirstRealTargetArc(long nodeAddress, FST.Arc<T> arc, FST.BytesReader in) throws java.io.IOException
java.io.IOException
boolean isExpandedTarget(FST.Arc<T> follow, FST.BytesReader in) throws java.io.IOException
arc
's target points to a node in expanded format (fixed length arcs).java.io.IOException
public FST.Arc<T> readNextArc(FST.Arc<T> arc, FST.BytesReader in) throws java.io.IOException
java.io.IOException
int readNextArcLabel(FST.Arc<T> arc, FST.BytesReader in) throws java.io.IOException
java.io.IOException
public FST.Arc<T> readArcByIndex(FST.Arc<T> arc, FST.BytesReader in, int idx) throws java.io.IOException
java.io.IOException
public FST.Arc<T> readArcByDirectAddressing(FST.Arc<T> arc, FST.BytesReader in, int rangeIndex) throws java.io.IOException
rangeIndex
- The index of the arc in the label range. It must be present.
The real arc offset is computed based on the presence bits of
the direct addressing node.java.io.IOException
public FST.Arc<T> readNextRealArc(FST.Arc<T> arc, FST.BytesReader in) throws java.io.IOException
java.io.IOException
private FST.Arc<T> readArc(FST.Arc<T> arc, FST.BytesReader in) throws java.io.IOException
java.io.IOException
public FST.Arc<T> findTargetArc(int labelToMatch, FST.Arc<T> follow, FST.Arc<T> arc, FST.BytesReader in) throws java.io.IOException
java.io.IOException
private void seekToNextNode(FST.BytesReader in) throws java.io.IOException
java.io.IOException
public FST.BytesReader getBytesReader()
FST.BytesReader
for this FST, positioned at
position 0.