0.9.8.10
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
WordStream.cc
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2007-2015 Hypertable, Inc.
3  *
4  * This file is part of Hypertable.
5  *
6  * Hypertable is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 3
9  * of the License, or any later version.
10  *
11  * Hypertable is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19  * 02110-1301, USA.
20  */
21 
27 #include "Common/Compat.h"
28 
29 #include <cctype>
30 
31 extern "C" {
32 #include <sys/mman.h>
33 }
34 
35 #include "FileUtils.h"
36 #include "Properties.h"
37 
38 #include "WordStream.h"
39 
40 using namespace Hypertable;
41 
// Advances `ptr` past consecutive whitespace characters without moving beyond
// `m_end`. Both `ptr` and `m_end` are picked up from the scope in which the
// macro is expanded.
//
// The character is converted to unsigned char before being handed to
// isspace(), which is only defined for values representable as unsigned char
// (or EOF); a plain char holding a byte >= 0x80 may be negative.
//
// The do/while(0) wrapper makes the expansion a single statement, so
// `SKIP_SPACE;` is safe as the body of an unbraced if/else.
#define SKIP_SPACE \
  do { \
    while (ptr < m_end && isspace(static_cast<unsigned char>(*ptr))) \
      ++ptr; \
  } while (0)
45 
46 WordStream::WordStream(const String &word_file, unsigned seed,
47  size_t words_per_record, bool random, const char *separator)
48  : m_separator(separator), m_words_per_record(words_per_record),
49  m_random(random) {
50  if (seed)
51  ms_rng.seed((uint32_t)seed);
52 
53  if (!m_random)
54  m_offset.resize(words_per_record, 0);
55 
56  m_base = (char *)FileUtils::mmap(word_file, &m_len);
57  m_end = m_base + m_len;
58 
59  size_t count = 0;
60 
61  for (const char *ptr = m_base; ptr < m_end; ++ptr) {
62  if (*ptr == '\n')
63  count++;
64  }
65 
66  m_words.reserve(count);
67 
68  const char *ptr = m_base;
69  struct word_info wi;
70  SKIP_SPACE;
71  wi.word = ptr;
72  for (; ptr < m_end; ++ptr) {
73  if (isspace(*ptr)) {
74  count++;
75  wi.len = ptr-wi.word;
76  m_words.push_back(wi);
77  SKIP_SPACE;
78  wi.word = ptr;
79  }
80  }
81 }
82 
84  ::munmap((void *)m_base, m_len);
85 }
86 
87 const char *WordStream::next() {
88  std::uniform_int_distribution<> dist {0, (int)(m_words.size()-1)};
89  size_t offset;
90  m_record.clear();
91 
92  for (size_t i = 0; i < m_words_per_record; ++i) {
93  if (m_random) {
94  offset = dist(ms_rng);
95  }
96  else {
97  if (m_offset[i] == m_words.size()) {
98  m_offset[i] = offset = 0;
99  if (i < (m_words_per_record - 1))
100  m_offset[i + 1]++;
101  }
102  else if (i == 0)
103  offset = m_offset[i]++;
104  else
105  offset = m_offset[i];
106  }
107  m_record += String(m_words[offset].word, m_words[offset].len) + m_separator;
108  }
109 
110  m_record.resize(m_record.size() - 1);
111  return m_record.c_str();
112 }
113 
std::string String
A String is simply a typedef to std::string.
Definition: String.h:44
const char * m_separator
The separator, as specified by the user.
Definition: WordStream.h:85
const char * m_end
End pointer for the memory mapped file.
Definition: WordStream.h:82
A class generating a stream of words; the words are retrieved from a file and can be randomized...
Program options handling.
std::mt19937 ms_rng
Random number generator.
Definition: WordStream.h:76
size_t m_words_per_record
Words per record, as specified by the user.
Definition: WordStream.h:88
off_t m_len
Length of the memory mapped file.
Definition: WordStream.h:91
std::vector< struct word_info > m_words
All words from the mapped file.
Definition: WordStream.h:94
File system utility functions.
String m_record
The current string.
Definition: WordStream.h:100
char * m_base
Base pointer for the memory mapped file.
Definition: WordStream.h:79
Internal structure for a single word.
Definition: WordStream.h:70
Compatibility Macros for C/C++.
std::vector< size_t > m_offset
Helper for parsing the words.
Definition: WordStream.h:97
virtual ~WordStream()
Releases internal resources.
Definition: WordStream.cc:83
Hypertable definitions
static void * mmap(const String &fname, off_t *lenp)
Maps a full file into memory using mmap; the mapping will be released when the application terminates...
Definition: FileUtils.cc:343
#define SKIP_SPACE
Definition: WordStream.cc:42
const char * next()
Retrieves the next word, or an empty string if EOF is reached.
Definition: WordStream.cc:87
WordStream(const String &word_file, unsigned seed, size_t words_per_record, bool random=false, const char *separator=" ")
Constructor.
Definition: WordStream.cc:46
bool m_random
Whether to return random strings or not.
Definition: WordStream.h:103